CUDA编译与执行

CUDA编译与执行

现在把所有的代码放在一个文件名为sumArraysOnGPU-small-case.cu的文件中,如代码清单2-4所示。

代码清单2-4 基于GPU的向量加法(sumArraysOnGPU-small-case.cu)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#include <cuda_runtime.h>

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/*
 * Error-checking macro from the book, kept commented out. Note that a
 * multi-line macro needs a backslash line-continuation at the end of
 * every line; without them the original would not compile if enabled.
#define CHECK(call)                                                        \
{                                                                          \
    const cudaError_t error = call;                                        \
    if (error != cudaSuccess)                                              \
    {                                                                      \
        printf("Error:%s:%d, ", __FILE__, __LINE__);                       \
        printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
    }                                                                      \
}
*/
/**
 * Compare the host-computed reference against the GPU result element-wise.
 * Prints a mismatch report (first differing index) or a success message.
 *
 * @param hostRef  CPU reference result, length N
 * @param gpuRef   GPU result copied back to the host, length N
 * @param N        number of elements to compare
 */
void checkResult(float *hostRef, float *gpuRef, const int N){
    double epsilon = 1.0E-8;
    int match = 1;
    for (int i = 0; i < N; i++){
        // fabs, not abs: abs() takes an int, so the float difference would
        // be truncated and any mismatch smaller than 1.0 silently accepted.
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon){
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match.\n\n");
    return;
}

/**
 * Fill ip[0..size-1] with pseudo-random floats in the range [0.0, 25.5].
 *
 * @param ip    destination buffer (host memory)
 * @param size  number of elements to generate
 */
void initialData(float *ip,int size){
    // Seed the C PRNG from the current wall-clock time so each run differs.
    time_t t;
    srand((unsigned) time(&t));

    // rand() & 0xFF yields 0..255; dividing by 10 maps it to 0.0..25.5.
    for (int idx = 0; idx < size; ++idx){
        ip[idx] = (float)(rand() & 0xFF) / 10.0f;
    }
}

/**
 * Host (CPU) reference implementation of element-wise vector addition:
 * C[i] = A[i] + B[i] for i in [0, N).
 *
 * Fix: the original took only three parameters and read threadIdx.x,
 * which exists only in device code (would not compile on the host), and
 * main() already calls this function with four arguments. Restored the
 * four-parameter looping form that the caller expects.
 *
 * @param A  first input vector, length N
 * @param B  second input vector, length N
 * @param C  output vector, length N
 * @param N  number of elements
 */
void sumArraysOnHost(float *A, float *B, float *C, const int N){
    for (int i = 0; i < N; i++)
        C[i] = A[i] + B[i];
}

/**
 * GPU kernel: element-wise vector addition, one element per thread.
 *
 * Expected launch: a single 1-D block with exactly nElem threads —
 * indexing uses threadIdx.x only, so a multi-block launch would repeat
 * indices. There is no bounds check; the caller must not launch more
 * threads than there are elements.
 *
 * Fix: "__gloal__" was a typo for the __global__ qualifier.
 */
__global__ void sumArraysOnGPU(float *A, float *B, float *C){
    int i = threadIdx.x;
    C[i] = A[i] + B[i];
}

/**
 * Driver: allocate 32-element vectors, add them on both the GPU and the
 * CPU, and verify the two results agree.
 *
 * Fixes: "cudaMemcpyHostTodevice" (wrong case — undeclared identifier;
 * the enum is cudaMemcpyHostToDevice) on both host-to-device copies, and
 * "pritnf" typo for printf.
 */
int main(int argc, char **argv){
    printf("%s Starting...\n", argv[0]);

    // set up device
    int dev = 0;
    cudaSetDevice(dev);

    // set up data size of vectors
    int nElem = 32;
    printf("Vector size %d\n", nElem);

    // malloc host memory
    size_t nBytes = nElem * sizeof(float);

    float *h_A, *h_B, *hostRef, *gpuRef;
    h_A = (float*)malloc(nBytes);
    h_B = (float*)malloc(nBytes);
    hostRef = (float*)malloc(nBytes);
    gpuRef = (float*)malloc(nBytes);

    // initialize data at host side
    initialData(h_A, nElem);
    initialData(h_B, nElem);

    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // malloc device global memory
    float *d_A, *d_B, *d_C;
    cudaMalloc((float**)&d_A, nBytes);
    cudaMalloc((float**)&d_B, nBytes);
    cudaMalloc((float**)&d_C, nBytes);

    // transfer data from host to device
    cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);

    // invoke kernel at host side: one block holding all nElem threads
    dim3 block(nElem);
    dim3 grid(nElem / block.x);

    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C);
    printf("Execution configuration <<<%d, %d>>>\n", grid.x, block.x);

    // copy kernel result back to host side (cudaMemcpy blocks until the
    // kernel has finished, so no explicit synchronization is needed here)
    cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);

    // add vector at host side for result checks
    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    return(0);
}

在这段代码中,向量大小被设置为32,如下所示:

1
int nElem = 32;

执行配置被放入一个块内,其中包含32个元素:

1
2
dim3 block(nElem);
dim3 grid(nElem/block.x);

使用以下命令编译和执行该代码:

1
2
nvcc sumArraysOnGPU-small-case.cu -o addvector
./addvector

系统报告如下:

1
2
3
4
./addvector Starting...
Vector size 32
Execution configuration <<<1,32>>>
Arrays match.

如果你将执行配置重新定义为32个块,每个块只有一个元素,如下所示:

1
2
dim3 block(1);
dim3 grid(nElem);

那么就需要在代码清单2-4中对核函数sumArraysOnGPU进行修改:

1
将 int i = threadIdx.x; 替换为 int i = blockIdx.x;

一般情况下,可以基于给定的一维网格和块的信息来计算全局数据访问的唯一索引:

1
2
3
4
/**
 * GPU kernel: element-wise vector addition with a general 1-D global index.
 *
 * Fixes: "__gloal__" typo for __global__, and the index formula must be
 * block offset PLUS thread offset — blockIdx.x * blockDim.x + threadIdx.x —
 * not a product, exactly as the surrounding text describes.
 */
__global__ void sumArraysOnGPU(float *A, float *B, float *C){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    C[i] = A[i] + B[i];
}

你需要确保一般情况下进行更改所产生结果的正确性。

参考资料

CUDA C编程权威指南 程润伟,Max Grossman(美),Ty Mckercher


本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!