essay on programming languages, computer science, information technologies and all.

Tuesday, January 29, 2013

CUDA Study - Pattern Inspection Algorithm #1

There is an image processing algorithm that I want to see how fast it can be implemented. It is the first step of a common pattern inspection algorithm used on LCDs and other regular patterns.

The algorithm simply computes, for each pixel, the difference between twice the center pixel and the sum of the two pixels one pitch distance away to the east and west; since the pitch is not an integer number of pixels, the neighbors are linearly interpolated. Here is a horizontal pitch comparison version in plain C using fixed-point arithmetic.

#include <stdint.h>

#ifndef max
#define max(a,b) ((a) > (b) ? (a) : (b))
#endif

uint8_t PlainC_HorizontalPitch( 
 uint8_t center,
 uint8_t eastLeft, uint8_t eastRight, uint8_t westLeft, uint8_t westRight, 
 float pitch )
{
 /* split the fractional part of the pitch into 8-bit interpolation weights */
 uint16_t toFloor = (uint16_t)( ((pitch - (int)pitch) * 0xFF) + 0.5f );
 uint16_t toCeiling = 0xFF - toFloor;

 /* linearly interpolate the east and west neighbors at the sub-pixel pitch */
 uint16_t east = toFloor * eastLeft + toCeiling * eastRight;
 uint16_t west = toFloor * westRight + toCeiling * westLeft;

 east = east >> 1;   // fx9.16
 west = west >> 1; 

 /* use int here so the clamp against 0 survives the subtractions */
 int diff1, diff2, diff;
 int c2 = center << 8;   // fx9.16   : this is 2c

 /* |2c - (east + west)| computed as the sum of the two one-sided clamps */
 diff1 = max( 0,  c2 - east - west );
 diff2 = max( 0,  east + west - c2 );
 diff = ( diff1 + diff2 ) >> 8;   // fx8.8

 return (uint8_t)diff;
} 

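For context, here is a minimal driver loop showing how this per-pixel function would be applied across one row. This is my own sketch, not part of the original code; the sample placement follows the indexing used in the CUDA kernel below.

/* Hypothetical driver loop (not from the original code): run the
   per-pixel comparison across one row, skipping a pitch-wide border. */
void HorizontalPitchRow( const uint8_t* src, uint8_t* dst,
                         int width, float pitch )
{
 int ip = (int)pitch;   /* integer part of the pitch */

 for( int x = ip + 1; x < width - ip - 1; ++x )
 {
  dst[x] = PlainC_HorizontalPitch(
   src[x],          /* center */
   src[x - ip],     /* east sample weighted by toFloor */
   src[x - ip - 1], /* east sample weighted by toCeiling */
   src[x + ip + 1], /* west sample weighted by toCeiling */
   src[x + ip],     /* west sample weighted by toFloor */
   pitch );
 }
}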

Now this is modified to run as a CUDA kernel, shown below. I really don't expect it to perform well; this version is just a very crude first attempt, and I want to see where the bottleneck is and make a plan for optimization.
__global__ 
void KernelPitch0( 
 unsigned char* d_src, unsigned char* d_dst, 
 const Area* imgSize, 
 const Region* roi,
 const PitchContext* pc )
{
 // one thread per ROI pixel
 int x = roi->Left + blockIdx.x * blockDim.x + threadIdx.x;
 int y = roi->Top + blockIdx.y * blockDim.y + threadIdx.y;
 int idx = x + y * imgSize->Width;

 bool isInBound = x < roi->Right && y < roi->Bottom;

 if( isInBound )
 {
  // linearly interpolate the neighbors one (sub-pixel) pitch away
  float east = 
   (float)d_src[ idx - pc->IntegerPitch - 1 ] * pc->ToCeiling +
   (float)d_src[ idx - pc->IntegerPitch ] * pc->ToFloor;
  float west = 
   (float)d_src[ idx + pc->IntegerPitch ] * pc->ToFloor + 
   (float)d_src[ idx + pc->IntegerPitch + 1 ] * pc->ToCeiling;

  // half of | 2 * center - ( east + west ) |
  float diff = ( (float)d_src[idx] * 2.0f - ( east + west ) ) / 2.0f;

  d_dst[ idx ] = (unsigned char)( diff >= 0 ? diff : -diff );
 }
}

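The Area, Region, and PitchContext types are not shown in the post. Based on the fields the kernel and the host code touch, plausible definitions would look like the following; this is my reconstruction, so take the exact layout as an assumption.

// Reconstructed helper types (assumed, not from the original post).
struct Area
{
 int Width;
 int Height;
 Area( int w, int h ) : Width( w ), Height( h ) {}
};

struct Region
{
 int Left, Top, Right, Bottom;
 Region( int l, int t, int r, int b )
  : Left( l ), Top( t ), Right( r ), Bottom( b ) {}
};

struct PitchContext
{
 float Pitch;        // full sub-pixel pitch
 int   IntegerPitch; // (int)Pitch, the whole-pixel part
 float ToFloor;      // fractional part of Pitch
 float ToCeiling;    // 1.0f - ToFloor
};
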
The kernel is launched as in the code snippet below.
void CUDA_Pitch0( 
 const unsigned char *h_src, unsigned char *h_dst, 
 int width, int height, 
 int roiLeft, int roiTop, int roiRight, int roiBottom,
 float horPitch, float verPitch )
{
 using namespace boost;

 int imgSize = width*height;

 DeviceByteImagePtr devSrc( new DeviceByteImage( width, height ) );
 DeviceByteImagePtr devDst( new DeviceByteImage( width, height ) );

 // round up so a partial tile at the right/bottom edge is still covered;
 // the in-bounds check in the kernel discards the extra threads
 dim3 grids( ( width + 31 ) / 32, ( height + 31 ) / 32 );
 dim3 threads( 32, 32 );

 HANDLE_ERROR( cudaMemcpy( devSrc->GetPixelPtr(), h_src, imgSize, cudaMemcpyHostToDevice ) );

 shared_ptr<Area> devImageDim = CreateSmartDeviceMemoryPointer<Area>();
 shared_ptr<Region> devROI = CreateSmartDeviceMemoryPointer<Region>();
 shared_ptr<PitchContext> devHorPitch = CreateSmartDeviceMemoryPointer<PitchContext>();
 
 {
  Area imageDim( width, height );
  HANDLE_ERROR( cudaMemcpy( devImageDim.get(), &imageDim, sizeof(Area), cudaMemcpyHostToDevice ) );

  Region roi( roiLeft, roiTop, roiRight, roiBottom );
  HANDLE_ERROR( cudaMemcpy( devROI.get(), &roi, sizeof(Region), cudaMemcpyHostToDevice ) );

  PitchContext pc;

  pc.Pitch = horPitch;
  pc.IntegerPitch = (int)horPitch;
  pc.ToFloor = horPitch - pc.IntegerPitch;
  pc.ToCeiling = 1.0f - pc.ToFloor;
  HANDLE_ERROR( cudaMemcpy( devHorPitch.get(), &pc, sizeof(PitchContext), cudaMemcpyHostToDevice ) );
 }

 KernelPitch0<<< grids, threads >>>( 
   devSrc->GetPixelPtr(), devDst->GetPixelPtr(), 
   devImageDim.get(), devROI.get(), 
   devHorPitch.get()
  );
 if ( cudaGetLastError() != cudaSuccess ) 
 {
  std::cerr << "Launch Error" << std::endl;
  return;
 }

 HANDLE_ERROR( cudaMemcpy( h_dst, devDst->GetPixelPtr(), imgSize, cudaMemcpyDeviceToHost ) );
}

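HANDLE_ERROR, DeviceByteImage, and CreateSmartDeviceMemoryPointer are helpers that are not shown either. A typical shape for the two that matter here, under my assumption that the smart pointer is simply a cudaMalloc wrapped in a boost::shared_ptr whose deleter is cudaFree, plus the usual error-check macro from the CUDA samples, is:

// Assumed helper implementations, not from the original post.
#include <cstdio>
#include <cstdlib>
#include <boost/shared_ptr.hpp>
#include <cuda_runtime.h>

static void HandleError( cudaError_t err, const char* file, int line )
{
 if( err != cudaSuccess )
 {
  fprintf( stderr, "%s in %s at line %d\n",
           cudaGetErrorString( err ), file, line );
  exit( EXIT_FAILURE );
 }
}
#define HANDLE_ERROR( err ) ( HandleError( (err), __FILE__, __LINE__ ) )

// Allocates sizeof(T) bytes of device memory and frees it automatically
// when the last shared_ptr copy goes out of scope.
template<typename T>
boost::shared_ptr<T> CreateSmartDeviceMemoryPointer()
{
 T* raw = 0;
 HANDLE_ERROR( cudaMalloc( (void**)&raw, sizeof( T ) ) );
 return boost::shared_ptr<T>( raw, cudaFree );
}
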
When an 8192x512 ( 4MB ) image is processed, it takes around 9291 usec, which gives me 428.3 MB/sec. The environment is as below.
  • GeForce GT 640, CUDA 5.0
  • Windows 7 64-bit, Pentium G2120 @ 3.10GHz, 4GB RAM
  • VS 2008

Looking into the profiling, the memory copy from host to device for the source takes 1384 usec, the actual kernel takes 4081 usec, and the memory copy from device to host takes another 1384 usec. Refer to the image below. The first cudaMemcpy interval is the copy from host to device. The second cudaMemcpy interval actually covers both the kernel time and the device-to-host copy, because the kernel launch is asynchronous and the blocking copy has to wait for the kernel to finish.

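Since the launch is asynchronous, the profiler folds the kernel time into the wall time of the following blocking cudaMemcpy. To time the kernel in isolation, the standard cudaEvent API can be used; a minimal sketch:

// Sketch: measuring the kernel alone with CUDA events.
cudaEvent_t start, stop;
HANDLE_ERROR( cudaEventCreate( &start ) );
HANDLE_ERROR( cudaEventCreate( &stop ) );

HANDLE_ERROR( cudaEventRecord( start, 0 ) );
KernelPitch0<<< grids, threads >>>(
 devSrc->GetPixelPtr(), devDst->GetPixelPtr(),
 devImageDim.get(), devROI.get(), devHorPitch.get() );
HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
HANDLE_ERROR( cudaEventSynchronize( stop ) );

float kernelMs = 0.0f;
HANDLE_ERROR( cudaEventElapsedTime( &kernelMs, start, stop ) );

HANDLE_ERROR( cudaEventDestroy( start ) );
HANDLE_ERROR( cudaEventDestroy( stop ) );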
