CUDA - typical program structure
Global variables declaration
__host__
__device__
__global__
__constant__
texture
Function prototypes
__global__ void kernelOne(...)
__device__ / __host__ float handyFunction(...)
Main()
allocate memory space on the device - cudaMalloc(&d_GlobalVarPtr, bytes)
transfer data from host to device - cudaMemcpy(d_GlobalVarPtr, h_GlobalVa...)
execution configuration setup
kernel call - kernelOne<<<execution configuration>>>(args ...);
transfer results from device to host - cudaMemcpy(h_GlobalVarPtr, d_Global...)
free memory space on device - cudaFree(d_GlobalVarPtr);
Kernel - void kernelOne(type args,...)
variables declaration:
__shared__
automatic variables transparently assigned to registers or local memory
__syncthreads()...
/*
 * Element-wise vector addition on the GPU: c[i] = a[i] + b[i].
 *
 * Expected launch: 1-D grid of 1-D blocks, one thread per element.
 *
 * NOTE(review): the signature carries no element count, so the kernel
 * cannot bounds-check its index. The launch configuration must provide
 * exactly as many threads as there are elements (or the vectors must be
 * padded up to gridDim.x * blockDim.x elements) — confirm with callers,
 * or extend the interface with an `int n` guard parameter.
 */
__global__ void cudaKernel(float *a, float *b, float *c)
{
    // Flat global thread index: block offset plus offset within the block.
    int tID = blockIdx.x * blockDim.x + threadIdx.x;
    c[tID] = a[tID] + b[tID];
}
/* Abort with file/line context if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

/*
 * Host driver for the vector-addition kernel: allocates vectors on host
 * and device, copies the inputs to the GPU, launches cudaKernel, copies
 * the result back, verifies one element, and frees all memory.
 */
int main(int argc, char* argv[]) {
    const int n = 1 << 20;                  /* element count; multiple of the
                                               block size because the kernel
                                               has no bounds check */
    const size_t bytes = (size_t)n * sizeof(float);

    /* Allocate and initialize vectors a, b and c on the CPU. */
    float *h_a = (float*)malloc(bytes);
    float *h_b = (float*)malloc(bytes);
    float *h_c = (float*)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; ++i) {
        h_a[i] = (float)i;
        h_b[i] = 2.0f * (float)i;
    }

    /* Allocate device memory for the three vectors. */
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    /* Data transfer: copy input vectors a and b to the GPU. */
    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    /* Execution configuration: one thread per element, 256 per block. */
    const int threadsPerBlock = 256;
    const int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
    cudaKernel<<<blocks, threadsPerBlock>>>(d_a, d_b, d_c);
    CUDA_CHECK(cudaGetLastError());   /* catch launch-configuration errors */

    /* Data transfer: copy output vector c back to the CPU.
       (A blocking cudaMemcpy also synchronizes with the kernel.) */
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    /* Spot-check one element: c[i] should be 3*i. */
    if (h_c[1] != 3.0f) {
        fprintf(stderr, "verification failed: c[1] = %f\n", h_c[1]);
        return EXIT_FAILURE;
    }

    /* Free memory on the device, then on the host. */
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    free(h_a);
    free(h_b);
    free(h_c);
    return 0;
}
Last updated
Was this helpful?