CUDA - typical program structure
Global variables declaration
__host__
__device__
__global__
__constant__
texture
Function prototypes
__global__ void kernelOne(...)
__device__ / __host__ float handyFunction(...)
Main()
allocate memory space on the device - cudaMalloc(&d_GlobalVarPtr, bytes)
transfer data from host to device - cudaMemcpy(d_GlobalVarPtr, h_GlobalVa...)
execution configuration setup
kernel call - kernelOne<<<execution configuration>>>(args ...);
transfer results from device to host - cudaMemcpy(h_GlobalVarPtr, d_Global...)
free memory space on device - cudaFree(d_GlobalVarPtr);
Kernel - void kernelOne(type args,...)
variables declaration:
__shared__
automatic variables transparently assigned to registers or local memory
__syncthreads()...
/*
 * Element-wise vector addition on the GPU: c[i] = a[i] + b[i].
 *
 * Expected launch: 1-D grid of 1-D blocks, one thread per element.
 *
 * NOTE(review): the signature carries no element count, so the kernel
 * cannot bounds-check its index. The launch configuration must provide
 * exactly as many threads as there are elements (or the vectors must be
 * padded up to gridDim.x * blockDim.x elements) — confirm with callers,
 * or extend the interface with an `int n` guard parameter.
 */
__global__ void cudaKernel(float *a, float *b, float *c)
{
    // Flat global thread index: block offset plus offset within the block.
    int tID = blockIdx.x * blockDim.x + threadIdx.x;
    c[tID] = a[tID] + b[tID];
}
/* Abort with file/line context if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

/*
 * Host driver for the vector-addition kernel: allocates vectors on host
 * and device, copies the inputs to the GPU, launches cudaKernel, copies
 * the result back, verifies one element, and frees all memory.
 */
int main(int argc, char* argv[]) {
    const int n = 1 << 20;                  /* element count; multiple of the
                                               block size because the kernel
                                               has no bounds check */
    const size_t bytes = (size_t)n * sizeof(float);

    /* Allocate and initialize vectors a, b and c on the CPU. */
    float *h_a = (float*)malloc(bytes);
    float *h_b = (float*)malloc(bytes);
    float *h_c = (float*)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    for (int i = 0; i < n; ++i) {
        h_a[i] = (float)i;
        h_b[i] = 2.0f * (float)i;
    }

    /* Allocate device memory for the three vectors. */
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    CUDA_CHECK(cudaMalloc(&d_a, bytes));
    CUDA_CHECK(cudaMalloc(&d_b, bytes));
    CUDA_CHECK(cudaMalloc(&d_c, bytes));

    /* Data transfer: copy input vectors a and b to the GPU. */
    CUDA_CHECK(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    /* Execution configuration: one thread per element, 256 per block. */
    const int threadsPerBlock = 256;
    const int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;
    cudaKernel<<<blocks, threadsPerBlock>>>(d_a, d_b, d_c);
    CUDA_CHECK(cudaGetLastError());   /* catch launch-configuration errors */

    /* Data transfer: copy output vector c back to the CPU.
       (A blocking cudaMemcpy also synchronizes with the kernel.) */
    CUDA_CHECK(cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost));

    /* Spot-check one element: c[i] should be 3*i. */
    if (h_c[1] != 3.0f) {
        fprintf(stderr, "verification failed: c[1] = %f\n", h_c[1]);
        return EXIT_FAILURE;
    }

    /* Free memory on the device, then on the host. */
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    free(h_a);
    free(h_b);
    free(h_c);
    return 0;
}
Last updated
Was this helpful?