/*
 * Host driver for the vcos kernel: fills host_x with n input values, copies
 * them to the device, launches vcos with one thread per element, copies the
 * result back into host_y, and releases every buffer.
 *
 * On exit vcos_status is cudaSuccess only if every step succeeded; otherwise
 * it holds the first failure (cudaGetErrorString(vcos_status) describes it)
 * and the remaining steps were skipped.
 */
const int    vcos_threads = 256;                        /* threads per block */
const size_t vcos_count   = ( n > 0 ) ? (size_t)n : 0;  /* elements to process */
const size_t vcos_bytes   = vcos_count * sizeof(float);
cudaError_t  vcos_status  = cudaSuccess;

/* Round the block count UP so a partial last block covers the tail elements
 * (plain n/256 drops the last n%256 elements and yields 0 blocks for n<256). */
bk = (int)( ( vcos_count + vcos_threads - 1 ) / vcos_threads );

/* Device buffers are padded to a whole number of blocks so the threads of the
 * last block stay in bounds even if vcos does not guard on n.
 * NOTE(review): the kernel is not visible here -- confirm whether it checks
 * its index against n; if it does, the padding is merely harmless. */
const size_t vcos_dev_bytes = (size_t)bk * vcos_threads * sizeof(float);

host_x = (float*)malloc( vcos_bytes );
host_y = (float*)malloc( vcos_bytes );
dev_x  = NULL;   /* keeps the cudaFree calls below safe if allocation is skipped or fails */
dev_y  = NULL;

if( vcos_count > 0 )
{
    if( host_x == NULL || host_y == NULL )
        vcos_status = cudaErrorMemoryAllocation;

    if( vcos_status == cudaSuccess )
        vcos_status = cudaMalloc( &dev_x, vcos_dev_bytes );
    if( vcos_status == cudaSuccess )
        vcos_status = cudaMalloc( &dev_y, vcos_dev_bytes );

    if( vcos_status == cudaSuccess )
    {
        /* fill host_x[i] with data here (placeholder: x[i] = i for all n elements) */
        for( size_t i = 0; i < vcos_count; ++i )
            host_x[i] = (float) i;

        vcos_status = cudaMemcpy( dev_x, host_x, vcos_bytes, cudaMemcpyHostToDevice );
    }

    if( vcos_status == cudaSuccess )
    {
        /* launch 1 thread per vector-element, 256 threads per block */
        vcos<<<bk,vcos_threads>>>( n, dev_x, dev_y );
        vcos_status = cudaGetLastError();   /* launches report configuration errors only this way */
    }

    if( vcos_status == cudaSuccess )
    {
        /* Blocking copy: waits for the kernel to finish, so execution faults
         * raised inside vcos are also returned here. */
        vcos_status = cudaMemcpy( host_y, dev_y, vcos_bytes, cudaMemcpyDeviceToHost );
    }
    /* if vcos_status == cudaSuccess, host_y now contains cos(x) data */
}

/* free(NULL) and cudaFree(NULL) are no-ops, so cleanup is unconditional */
free(host_x);
free(host_y);
cudaFree(dev_x);
cudaFree(dev_y);