/*
 * RGBTOGrayScale
 *
 * Converts an interleaved 4-byte-per-pixel image to gray scale. Bytes 0, 1
 * and 2 of every pixel are weighted 0.3 / 0.59 / 0.11 (the original comments
 * describe the layout as R, G, B, A) and the resulting gray value is written
 * to all four bytes of the matching output pixel, i.e. the fourth (alpha)
 * byte of the output is overwritten with the gray value as well.
 *
 * Parameters:
 *   inRGBImage   - input image, srcW * srcH pixels, 4 bytes per pixel.
 *   outGrayImage - output image, same size and layout as the input.
 *   srcW, srcH   - image width and height in pixels.
 *
 * Launch layout: a 2D grid of 2D blocks with one thread per pixel. The grid
 * only has to cover the image; it does not have to divide it evenly, and no
 * particular block size is required. No shared memory is used.
 *
 * Every thread reads its own pixel before writing that same pixel, so the
 * kernel never depends on data produced by another thread.
 */
__global__
void RGBTOGrayScale(unsigned char *inRGBImage, unsigned char *outGrayImage, int srcW, int srcH )
{
    // Pixel coordinates handled by this thread.
    const int x = blockIdx.x * blockDim.x + threadIdx.x; // column, along srcW
    const int y = blockIdx.y * blockDim.y + threadIdx.y; // row, along srcH

    // Threads in the grid tail must neither read nor write. Without this
    // guard a thread with x >= srcW computes a linear index that belongs to a
    // pixel in a following row (and clobbers it), and a thread with
    // y >= srcH accesses memory past the end of both buffers.
    if (x >= srcW || y >= srcH)
    {
        return;
    }

    // 64-bit index arithmetic so that large images cannot overflow int.
    const size_t pixelBase = ((size_t)y * (size_t)srcW + (size_t)x) * 4;

    const unsigned char r = inRGBImage[pixelBase + 0];
    const unsigned char g = inRGBImage[pixelBase + 1];
    const unsigned char b = inRGBImage[pixelBase + 2];

    // The weights are intentionally kept as double literals and the result is
    // truncated through int, exactly as before, so existing output values do
    // not change.
    const unsigned char gray =
        (unsigned char)(int)((r * 0.3) + (g * 0.59) + (b * 0.11));

    // Replicate the gray value into all four bytes of the output pixel.
    outGrayImage[pixelBase + 0] = gray;
    outGrayImage[pixelBase + 1] = gray;
    outGrayImage[pixelBase + 2] = gray;
    outGrayImage[pixelBase + 3] = gray;
}