int main(int argc, char **argv){
printf("%s Starting...\n",zrgv[0]);
int dev = 0;
cudaDeviceProp deviceProp;
CHECK(cudaGetDeviceProperties(&deviceProp, dev));
printf("Using Device %d: %s\n",dev, deviceProp.name);
CHECK(cudaSetDevice(dev));
int nx = 1<<14;
int ny = 1<<14;
int nxy = nx*ny;
int nBytes = nxy *sizeof(float);
printf("Matrix size: nx %d ny %d\n",nx,ny);
float *h_A, *h_B, *hostRef, *gpuRef;
h_A = (float *)malloc(nBytes);
h_B = (float *)malloc(nBytes);
hostRef = (float *)malloc(nBytes);
gpuRef = (float *)malloc(nBytes);
double iStart = cpuSecond();
initialData (h_A, nxy);
initialData (h_B, nxy);
double iElaps = cpuSecond() - iStart;
memset(hostRef, 0, nBytes);
memset(gpuRef, 0, nBytes);
iStart = cpuSecond();
sumMatrixOnHost (h_A, h_B, hostRef, nx,ny);
iElaps = cpuSecond() - iStart;
float *d_MatA, *d_MatB, *d_MatC;
cudaMalloc((void **)&d_MatA,nBytes);
cudaMalloc((void **)&d_MatB,nBytes);
cudaMalloc((void **)&d_MatC,nBytes);
cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice);
int dimx = 32;
int dimy = 32;
dim3 block(dimx,dimy);
dim3 grid((nx+block.x-1)/block.x,(ny+block.y-1)/block.y);
iStart = cpuSecond();
sumMatrixOnGPU2D<<<grid,block>>>(d_MatA, d_MatB, d_MatC, nx,ny);
cudaDeviceSynchronize();
iElaps = cpuSecond() - iStart;
printf("sumMatrixOnGPU2D<<<(%d,%d),(%d,%d)>>> elapsed %f sec\n",grid.x, grid.y, block.x,block.y,iElaps);
cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost);
checkResult(hostRef, gpuRef,nxy);
cudaFree(d_MatA);
cudaFree(d_MatB);
cudaFree(d_MatC);
free(h_A);
free(h_B);
free(hostRef);
free(gpuRef);
cudaDeviceReset();
return (0);
}