Shared memory sum

Example code

Example code

sumi=∑j=0N(a∗j+b) sum_i = {\sum^N_{j=0}}( a*j + b)

/* CUDA addition */

#include <stdio.h>
#include <math.h>
#include <sys/time.h>

#define NUMPOINTS 500000 //number of elements in sum array
#define BLOCK 48
#define GRID 1024

#define NUM_ITERS 150000 // number of additions pefromed foe each element of the array
#define SMEM_KERNEL 0 //1

void cudaCheck(cudaError_t error, const char *function, int line);

#if SMEM_KERNEL == 1
//Shared memory version of CUDA kernel for sum
__global__ void additionKernelSMem(int numPoints, float *sum, float a, float b) 
{
    __shared__ float sumShared[BLOCK];
    
    //total number of threads in the grid
    int gridThreads = gridDim.x * blockDim.x;
    
    //thread id for this thread
    int idx = blockIdx.x *  blockDim.x + threadIdx.x;
    
    for(int i = idx; i < numPoints; i+=gridThreads) 
    {
        //initialise the result array (global mem) and the intermediate array (shared men)
        sum[i] = 0.0;
        sumShared[threadIdx.x] = 0.0;
        
        //compute the sum
        for(int j = 0; j < NUM_ITERS; j++) 
        {
            sumShared[i] +=(float) (a*j + b);
        }
        
        //store the result in global memory
        sum[i] = sumShared[i];   
    }
}

#else
//Global memory only CUDA kernel for sum
__global__ void additionKernel(int numPoints, float *sum, float a, float b)
{
    //threads in grid
    int gridThreads = gridDim.x * blockDim.x;
    
    //thread id for this thread
    int idx = blockIdx.x *  blockDim.x + threadIdx.x;
    
    for(int i = idx; i < numPoints; i+=gridThreads) 
    {
        //initialise the result array (global mem) and the intermediate array (shared men)
        sum[i] = 0.0;
        
        //compute the sum
        for(int j = 0; j < NUM_ITERS; j++) 
        {
            sum[i] +=(float) (a*j + b);
        }
    }
}
#endif

//host sum
void addition(int numPoints, float *sum, float a, float b)
{
    for (int i = 0; i < numPoints; i++) 
    {
        sum[i] = 0.0;
        
        for (int j=0; j<NUM_ITERS; j++) 
        {
            sum[i] += (float) (a * j + b);
        }
    }    
}

int main(int argc, char** argv) 
{
    cudaError_t cudaError;
    
    int numPoints;
    float *sum_h, *sum_d, *sumHost;
    float hostTime, deviceTime;
    float a = 1.01;
    float b = 0.01;
    
    //time struct for gettimeofday()
    struct timeval start, end;
    
    numPoints = (int) NUMPOINTS;
    
    size_t memSize = numPoints * sizeof(float);
    sum_h = (float*) malloc(memSize);
    sumHost = (float*) malloc(memSize);
    
    if (sum_h == NULL || sumHost == NULL) {
        printf("\nError in host memory allocation");
        exit(1);
    }
    
    cudaError = cudaMalloc(&sum_d, memSize);
    cudaCheck(cudaError, __FUNCTION__, __LINE__);
    
    //define grid and block size
    dim3 dimGrid(GRID,1,1);
    dim3 dimBlock(BLOCK,1);
    
    // start timer
    gettimeofday(&start, NULL);
    
    //call kernel
#if SMEM_KERNEL == 1
    additionKernelSMem<<<dimGrid,dimGrid>>>(numPoints,sum_d,a,b);
#else
    additionKernel<<<dimGrid,dimGrid>>>(numPoints,sum_d,a,b);
#endif

    //wait for completion
    cudaThreadSynchronize();
    
    cudaCheck(cudaGetLastError(), __FUNCTION__, _)__LINE__);
    
    //stop timer
    gettimeofday(&end, NULL);
    
    deviceTime = (end.tv_sec - start.tv_sec)*1000.0 +
                    (end.tv_usec - start.tv_usec);
                    
    cudaError = cudaMemcpy(sum_h,sum_d, memSize, cudaMemcpyDeviceToHost);
    cudaCheck(cudaError, __FUNCTION__, __LINE__);
    
    
    //HOST
    // start timer
    gettimeofday(&start, NULL);
    addition(numPoints,sumHost,a,b);
    gettimeofday(&end, NULL);
    hostTime = (end.tv_sec - start.tv_sec)*1000.0 +
                    (end.tv_usec - start.tv_usec);
    
    
    //verify
    for (int i = 0; i<numPoints;; i++) {
        float normErr = fabs((sumHost[i] - sum_h[i])/sumHost[i]);
        if (normErr > 0.01) {
            printf("\n Error: Device and host data differ by more than 1 percent at element %1\n",i);
            exit(1);
        } 
    }
    
    printf("\n CUDA kernel completed successfully.");
    printf("\n Device time (kernel only) = %f [ms]", deviceTime);
    printf("\n Host time = %f [ms]", hostTime);
    
    //free mem
    cudaError = cudaFree(sum_d);
    cudaCheck(cudaError, __FUNCTION__, __LINE__);
    
    free(sum_h);
    free(sumHost);
    
    return 0;
     
}


void cudaCheck(cudaError_t error, const char *function, int line){
    if (error != cudaSuccess){
        fprintf(stderr, "ERROR in function %s at line %i: %s\n",
                function, line, cudaGetLastError(error));
        exit(1);
    } 
}

Last updated

Was this helpful?