CUDA编译与执行

CUDA编译与执行

现在把所有的代码放在一个文件名为sumArraysOnGPU-small-case.cu的文件中,如代码清单2-4所示。

代码清单2-4 基于GPU的向量加法(sumArraysOnGPU-small-case.cu)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#include <cuda_runtime.h>

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/*
 * Error-checking macro from the book, kept commented out. Note that a
 * multi-line macro needs a backslash line-continuation at the end of
 * every line; without them the original would not compile if enabled.
#define CHECK(call)                                                        \
{                                                                          \
    const cudaError_t error = call;                                        \
    if (error != cudaSuccess)                                              \
    {                                                                      \
        printf("Error:%s:%d, ", __FILE__, __LINE__);                       \
        printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
    }                                                                      \
}
*/
/**
 * Compare the host-computed reference against the GPU result element-wise.
 * Prints a mismatch report (first differing index) or a success message.
 *
 * @param hostRef  CPU reference result, length N
 * @param gpuRef   GPU result copied back to the host, length N
 * @param N        number of elements to compare
 */
void checkResult(float *hostRef, float *gpuRef, const int N){
    double epsilon = 1.0E-8;
    int match = 1;
    for (int i = 0; i < N; i++){
        // fabs, not abs: abs() takes an int, so the float difference would
        // be truncated and any mismatch smaller than 1.0 silently accepted.
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon){
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match.\n\n");
    return;
}

/**
 * Fill ip[0..size-1] with pseudo-random floats in the range [0.0, 25.5].
 *
 * @param ip    destination buffer (host memory)
 * @param size  number of elements to generate
 */
void initialData(float *ip,int size){
    // Seed the C PRNG from the current wall-clock time so each run differs.
    time_t t;
    srand((unsigned) time(&t));

    // rand() & 0xFF yields 0..255; dividing by 10 maps it to 0.0..25.5.
    for (int idx = 0; idx < size; ++idx){
        ip[idx] = (float)(rand() & 0xFF) / 10.0f;
    }
}

/**
 * Host (CPU) reference implementation of element-wise vector addition:
 * C[i] = A[i] + B[i] for i in [0, N).
 *
 * Fix: the original took only three parameters and read threadIdx.x,
 * which exists only in device code (would not compile on the host), and
 * main() already calls this function with four arguments. Restored the
 * four-parameter looping form that the caller expects.
 *
 * @param A  first input vector, length N
 * @param B  second input vector, length N
 * @param C  output vector, length N
 * @param N  number of elements
 */
void sumArraysOnHost(float *A, float *B, float *C, const int N){
    for (int i = 0; i < N; i++)
        C[i] = A[i] + B[i];
}

/**
 * GPU kernel: element-wise vector addition, one element per thread.
 *
 * Expected launch: a single 1-D block with exactly nElem threads —
 * indexing uses threadIdx.x only, so a multi-block launch would repeat
 * indices. There is no bounds check; the caller must not launch more
 * threads than there are elements.
 *
 * Fix: "__gloal__" was a typo for the __global__ qualifier.
 */
__global__ void sumArraysOnGPU(float *A, float *B, float *C){
    int i = threadIdx.x;
    C[i] = A[i] + B[i];
}

/**
 * Driver: allocate 32-element vectors, add them on both the GPU and the
 * CPU, and verify the two results agree.
 *
 * Fixes: "cudaMemcpyHostTodevice" (wrong case — undeclared identifier;
 * the enum is cudaMemcpyHostToDevice) on both host-to-device copies, and
 * "pritnf" typo for printf.
 */
int main(int argc, char **argv){
    printf("%s Starting...\n", argv[0]);

    // set up device
    int dev = 0;
    cudaSetDevice(dev);

    // set up data size of vectors
    int nElem = 32;
    printf("Vector size %d\n", nElem);

    // malloc host memory
    size_t nBytes = nElem * sizeof(float);

    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float*)malloc(nBytes);
    h_B = (float*)malloc(nBytes);
    hostRef = (float*)malloc(nBytes);
    gpuRef = (float*)malloc(nBytes);

    // initialize data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);

    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    cudaMalloc((float**)&d_A, nBytes);
    cudaMalloc((float**)&d_B, nBytes);
    cudaMalloc((float**)&d_C, nBytes);

    // transfer data from host to device
    cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);

    // invoke kernel at host side: one block holding all nElem threads
    dim3 block(nElem);
    dim3 grid(nElem / block.x);

    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C);
    printf("Execution configuration <<<%d, %d>>>\n", grid.x, block.x);

    // copy kernel result back to host side (cudaMemcpy blocks until the
    // kernel has finished, so no explicit synchronization is needed here)
    cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);

    // add vector at host side for result checks
    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    return(0);
}

在这段代码中,向量大小被设置为32,如下所示:

1
int nElem = 32;

执行配置被放入一个块内,其中包含32个元素:

1
2
dim3 block(nElem);
dim3 grid(nElem/block.x);

使用以下命令编译和执行该代码:

1
2
nvcc sumArraysOnGPU-small-case.cu -o addvector
./addvector

系统报告如下:

1
2
3
4
./addvector Starting...
Vector size 32
Execution configuration <<<1,32>>>
Arrays match.

如果你将执行配置重新定义为32个块,每个块只有一个元素,如下所示:

1
2
dim3 block(1);
dim3 grid(nElem);

那么就需要在代码清单2-4中对核函数sumArraysOnGPU进行修改:

1
将 int i = threadIdx.x; 替换为 int i = blockIdx.x;

一般情况下,可以基于给定的一维网格和块的信息来计算全局数据访问的唯一索引:

1
2
3
4
/**
 * GPU kernel: element-wise vector addition with a general 1-D global index.
 *
 * Fixes: "__gloal__" typo for __global__, and the index formula must be
 * block offset PLUS thread offset — blockIdx.x * blockDim.x + threadIdx.x —
 * not a product, exactly as the surrounding text describes.
 */
__global__ void sumArraysOnGPU(float *A, float *B, float *C){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    C[i] = A[i] + B[i];
}

你需要确保一般情况下进行更改所产生结果的正确性。

参考资料

CUDA C编程权威指南 程润伟,Max Grossman(美),Ty Mckercher


本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!