/*
 * Shared-memory sum — example CUDA code.
 */
/* CUDA addition */
#include <stdio.h>
#include <math.h>
#include <sys/time.h>
#define NUMPOINTS 500000 //number of elements in sum array
#define BLOCK 48
#define GRID 1024
#define NUM_ITERS 150000 // number of additions performed for each element of the array
#define SMEM_KERNEL 0 //1
void cudaCheck(cudaError_t error, const char *function, int line);
#if SMEM_KERNEL == 1
//Shared memory version of CUDA kernel for sum
//Shared-memory version of the CUDA sum kernel.
//Each thread accumulates its partial result in its own shared-memory slot
//(indexed by threadIdx.x); no data is actually exchanged between threads,
//so no __syncthreads() is needed.
//Expects a 1-D grid of 1-D blocks with blockDim.x == BLOCK; processes the
//numPoints elements with a grid-stride loop.
__global__ void additionKernelSMem(int numPoints, float *sum, float a, float b)
{
    __shared__ float sumShared[BLOCK];
    //total number of threads in the grid (stride of the grid-stride loop)
    int gridThreads = gridDim.x * blockDim.x;
    //global thread id for this thread
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = idx; i < numPoints; i += gridThreads)
    {
        //BUG FIX: the shared-memory scratch slot must be indexed by
        //threadIdx.x (0..BLOCK-1), not by the global index i, which ran
        //far past the end of sumShared[] and corrupted shared memory.
        sumShared[threadIdx.x] = 0.0f;
        //compute the sum in the per-thread shared-memory slot
        for (int j = 0; j < NUM_ITERS; j++)
        {
            sumShared[threadIdx.x] += (float)(a * j + b);
        }
        //store the result in global memory
        sum[i] = sumShared[threadIdx.x];
    }
}
#else
//Global memory only CUDA kernel for sum
//Global-memory CUDA kernel for the sum.
//Processes the numPoints elements of sum[] with a grid-stride loop; each
//element receives the accumulation of a*j + b over j = 0 .. NUM_ITERS-1.
__global__ void additionKernel(int numPoints, float *sum, float a, float b)
{
    //total number of threads in the grid (stride of the grid-stride loop)
    int gridThreads = gridDim.x * blockDim.x;
    //global thread id for this thread
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = idx; i < numPoints; i += gridThreads)
    {
        //accumulate in a register instead of a global-memory
        //read-modify-write on every iteration: NUM_ITERS global
        //accesses collapse into a single store per element
        float acc = 0.0f;
        for (int j = 0; j < NUM_ITERS; j++)
        {
            acc += (float)(a * j + b);
        }
        sum[i] = acc;
    }
}
#endif
//host sum
//Host (CPU) reference implementation of the sum.
//For each of the numPoints elements, accumulates a*j + b over
//j = 0 .. NUM_ITERS-1 and writes the total into sum[i].
void addition(int numPoints, float *sum, float a, float b)
{
    for (int point = 0; point < numPoints; point++)
    {
        sum[point] = 0.0;
        for (int iter = 0; iter < NUM_ITERS; iter++)
            sum[point] += (float)(a * iter + b);
    }
}
//Driver: runs the sum on the GPU and on the host, times both with
//gettimeofday(), and verifies the device result against the host result
//to within 1 percent relative error.
int main(int argc, char** argv)
{
    cudaError_t cudaError;
    int numPoints;
    float *sum_h, *sum_d, *sumHost;
    float hostTime, deviceTime;
    //float literals: avoid silent double promotion in the kernels
    float a = 1.01f;
    float b = 0.01f;
    //time structs for gettimeofday()
    struct timeval start, end;
    numPoints = (int) NUMPOINTS;
    size_t memSize = numPoints * sizeof(float);
    sum_h = (float*) malloc(memSize);
    sumHost = (float*) malloc(memSize);
    if (sum_h == NULL || sumHost == NULL) {
        printf("\nError in host memory allocation");
        exit(1);
    }
    cudaError = cudaMalloc(&sum_d, memSize);
    cudaCheck(cudaError, __FUNCTION__, __LINE__);
    //define grid and block size
    dim3 dimGrid(GRID, 1, 1);
    dim3 dimBlock(BLOCK, 1, 1);
    // start timer
    gettimeofday(&start, NULL);
    //launch kernel.  BUG FIX: the second launch argument (threads per
    //block) was dimGrid, launching GRID=1024 threads per block instead
    //of BLOCK=48 — the shared-memory kernel would index past its
    //BLOCK-sized shared array; it must be dimBlock.
#if SMEM_KERNEL == 1
    additionKernelSMem<<<dimGrid, dimBlock>>>(numPoints, sum_d, a, b);
#else
    additionKernel<<<dimGrid, dimBlock>>>(numPoints, sum_d, a, b);
#endif
    //wait for completion (cudaThreadSynchronize is deprecated)
    cudaDeviceSynchronize();
    //catches both launch-configuration and asynchronous execution errors
    cudaCheck(cudaGetLastError(), __FUNCTION__, __LINE__);
    //stop timer.  BUG FIX: the microsecond difference must be divided by
    //1000 to be added to the millisecond total, not added raw.
    gettimeofday(&end, NULL);
    deviceTime = (end.tv_sec - start.tv_sec) * 1000.0f +
                 (end.tv_usec - start.tv_usec) / 1000.0f;
    cudaError = cudaMemcpy(sum_h, sum_d, memSize, cudaMemcpyDeviceToHost);
    cudaCheck(cudaError, __FUNCTION__, __LINE__);
    //HOST reference computation, timed the same way
    gettimeofday(&start, NULL);
    addition(numPoints, sumHost, a, b);
    gettimeofday(&end, NULL);
    hostTime = (end.tv_sec - start.tv_sec) * 1000.0f +
               (end.tv_usec - start.tv_usec) / 1000.0f;
    //verify device result against host result (1% relative tolerance)
    for (int i = 0; i < numPoints; i++) {
        float normErr = fabsf((sumHost[i] - sum_h[i]) / sumHost[i]);
        if (normErr > 0.01f) {
            //BUG FIX: "%1" is not a conversion specifier; use %d
            printf("\n Error: Device and host data differ by more than 1 percent at element %d\n", i);
            exit(1);
        }
    }
    printf("\n CUDA kernel completed successfully.");
    printf("\n Device time (kernel only) = %f [ms]", deviceTime);
    printf("\n Host time = %f [ms]", hostTime);
    //free memory
    cudaError = cudaFree(sum_d);
    cudaCheck(cudaError, __FUNCTION__, __LINE__);
    free(sum_h);
    free(sumHost);
    return 0;
}
//Check a CUDA API return code; on failure, print the human-readable error
//string together with the calling function name and line, then abort.
//  error    - return code from a CUDA runtime call (or cudaGetLastError())
//  function - caller's __FUNCTION__
//  line     - caller's __LINE__
void cudaCheck(cudaError_t error, const char *function, int line){
    if (error != cudaSuccess){
        //BUG FIX: cudaGetLastError() takes no arguments; the string for an
        //error code comes from cudaGetErrorString(error).
        fprintf(stderr, "ERROR in function %s at line %i: %s\n",
                function, line, cudaGetErrorString(error));
        exit(1);
    }
}
/* End of example. */