/* CUDA vector multiplication */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda_runtime.h>

/* Kernel: each thread multiplies one pair of elements */
__global__ void vector_multi(float *A, float *B, float *C) {
    int idx = threadIdx.x;   /* single block, so the thread index is the element index */
    C[idx] = A[idx] * B[idx];
}
/* Allocate a vector of the given size and fill it with random numbers in [1, 101) */
void init_vector(float **vec, int size) {
    float *tmp;
    (*vec) = (float *) malloc(size * sizeof(float));
    tmp = (*vec);
    for (int i = 0; i < size; i++) {
        tmp[i] = 1 + (float)(100.0 * rand() / (RAND_MAX + 1.0));
    }
}
int main(void) {
    cudaDeviceProp prop;
    int my_device = -1, n_threads = 0;
    size_t vec_mem_size;
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;

    /* Use the maximum number of threads that can be allocated per block */
    cudaGetDevice(&my_device);
    cudaGetDeviceProperties(&prop, my_device);
    n_threads = prop.maxThreadsPerBlock;
    vec_mem_size = n_threads * sizeof(float);
    printf("Program running on %d threads.\n", n_threads);
    /* Initialize vectors A, B, C on CPU - host */
    init_vector(&h_A, n_threads);
    init_vector(&h_B, n_threads);
    h_C = (float *) malloc(vec_mem_size);
    memset(h_C, 0, vec_mem_size);

    /* Initialize vectors A, B, C on GPU - device */
    cudaMalloc(&d_A, vec_mem_size);
    cudaMalloc(&d_B, vec_mem_size);
    cudaMalloc(&d_C, vec_mem_size);
    cudaMemcpy(d_A, h_A, vec_mem_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, vec_mem_size, cudaMemcpyHostToDevice);
    cudaMemset(d_C, 0, vec_mem_size);
    /* Execute vector_multi on the GPU: one block of n_threads threads */
    vector_multi<<< 1, n_threads >>>(d_A, d_B, d_C);
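    /* Kernel launches are asynchronous with respect to the host; the
       cudaMemcpy below runs in the default stream, so it waits for
       vector_multi to finish before copying the result back. */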
    /* Copy the GPU result back to the CPU */
    cudaMemcpy(h_C, d_C, vec_mem_size, cudaMemcpyDeviceToHost);
    /* Free both CPU and GPU memory */
    free(h_A);
    free(h_B);
    free(h_C);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    return 0;
}