essay on programming languages, computer science, information techonlogies and all.

Wednesday, February 27, 2013

OpenCL - first pitch comparison

In here, the pitch comparsion will be tested with OpenCL from NVidia CUDA toolkit.

First, to simplify the platfrom, device, context and program creation, I make up a new class WorkBench which do all the chores. Refer below code. Note that this class rely on the reference counting of OpenCL c++ wrapper.


//
// usage :
//   Workbench task;
//   task.ConnectDevice( "GT 640" )
//       .BuildContext()
//       .BuildProgram( "OpenCLKernels.cl" );
//
class Workbench
{
public:
  Workbench() {}

  Workbench& ConnectDevice( const string& deviceName )
  {
    vector< cl::platform > platforms;  
    cl::Platform::get( &platforms );

    for( vector< cl::platform >::iterator I=platforms.begin(); I!=platforms.end(); ++I )
    {
      cl::Platform& platform( *I );
      
      vector< cl::device > devices;
      platform.getDevices( CL_DEVICE_TYPE_ALL, &devices );

      for( vector< cl::device >::iterator J=devices.begin(); J!=devices.end(); ++J )
      {
        cl::Device& device( *J );
        string discoveredName( device.getInfo< cl_device_name >() );

        if( discoveredName.find( deviceName ) != discoveredName.npos )
        {
          _Devices.push_back( device );
          return *this;
        }
      }
    }

    throw std::runtime_error( string("Cannot find the device with the name") + deviceName );
  }

  Workbench& BuildContext()
  {
    _Context = cl::Context( GetConnectedDevices() );
    return *this;
  }

  Workbench& BuildProgram( const string& sourceFile )
  {
    std::ifstream programFile( sourceFile.c_str() );
    std::string programString( 
      std::istreambuf_iterator< char >(programFile), 
      (std::istreambuf_iterator< char >()));
    cl::Program::Sources source( 
      1, 
      std::make_pair( programString.c_str(), programString.length()+1 )
      );
    _Program = cl::Program( _Context, source );
    _Program.build( GetConnectedDevices() );
    return *this;
  }

  vector< cl::Device >& GetConnectedDevices() { return _Devices; }
  cl::Context& GetContext() { return _Context; }
  cl::Program& GetProgram() { return _Program; }
    

private:
  vector< cl::Device > _Devices;
  cl::Context _Context;
  cl::Program _Program;
};

Then the host code snippet is as below
...
ByteImage src( width, height ), dst( width, height );
...
Workbench task;
task.ConnectDevice( "GT 640" )
  .BuildContext()
  .BuildProgram( "OpenCLKernels.cl" );

cl::Kernel kernelPitch0( task.GetProgram(), "Pitch0" );
cl::CommandQueue queue( task.GetContext(), task.GetConnectedDevices()[0], CL_QUEUE_PROFILING_ENABLE );
cl::Buffer devSrc( task.GetContext(), CL_MEM_READ_ONLY, width*height );
cl::Buffer devDst( task.GetContext(), CL_MEM_WRITE_ONLY, width*height );
...

vector< cl::Event > steps;
steps.push_back( cl::UserEvent( task.GetContext() ) );
steps.push_back( cl::Event() );
steps.push_back( cl::Event() );
steps.push_back( cl::Event() );

vector< cl::Event > ve0; ve0.push_back( steps[0] );
queue.enqueueWriteBuffer( devSrc, CL_FALSE, 0, width*height, src.GetPixelPtr(), &ve0, &steps[1] );

kernelPitch0.setArg( 0, devSrc );
kernelPitch0.setArg( 1, devDst );
...

const int LOCAL_WIDTH = 64, LOCAL_HEIGHT = 4;
cl::NDRange localws( LOCAL_WIDTH, LOCAL_HEIGHT );
cl::NDRange globalws( 
  LOCAL_WIDTH * (int)ceil( (double)(roiRight-roiLeft+1) / (double)LOCAL_WIDTH ), 
  LOCAL_HEIGHT * (int)ceil( (double)(roiBottom-roiTop+1) / (double)LOCAL_HEIGHT )
);

vector< cl::Event > ve1; ve1.push_back( steps[1] );
queue.enqueueNDRangeKernel( kernelPitch0, cl::NullRange, globalws, localws, &ve1, &steps[2] );

vector< cl::Event > ve2; ve2.push_back( steps[2] );
queue.enqueueReadBuffer( devDst, CL_FALSE, 0, width*height, dst.GetPixelPtr(), &ve2, &steps[3]  );

(static_cast< cl::userevent >(&steps[0]))->setStatus( CL_COMPLETE );
steps[3].wait();
queue.finish();

...
The device code is as below
__kernel
void Pitch0(
  __global uchar *d_src, 
  __global uchar *d_dst,
  int width, int height,
  int roiLeft, int roiTop, int roiRight, int roiBottom,
  int integerPitch, float toCeiling, float toFloor )
{
  int x = roiLeft + get_global_id(0);
  int y = roiTop + get_global_id(1);
  int idx = x + y * width;

  bool isInBound = x <= roiRight && y <= roiBottom;

  float east = 
    (float)d_src[ idx - integerPitch - 1 ] * toCeiling +
    (float)d_src[ idx - integerPitch ] * toFloor;
  float west = 
    (float)d_src[ idx + integerPitch ] * toFloor + 
    (float)d_src[ idx + integerPitch + 1 ] * toCeiling;

  float diff = ( (float)d_src[idx] * 2.0f  - ( east + west ) ) / 2.0f;
  diff = diff >= 0 ? diff : -diff;

  if( isInBound )
  {
    d_dst[ idx ] = (uchar)( diff  );
  }
}

The resulst is as below
Step 1 : start 0 ns, end 1165280 ns, duration 1165280 ns, 3407.96 MB/s
Step 2 : start 1388384 ns, end 2675712 ns, duration 1287328 ns, 3084.86 MB/s
Step 3 : start 2854816 ns, end 4050816 ns, duration 1196000 ns, 3320.43 MB/s
Total : duration 4050816 ns, 980.35 MB/s


The memory copy from host to device and device to host is around 1.2 milli second (ms). This is similar to the non-pinned memory copy time. And it means I have to find an equivalent of pinned memory in OpenCL.

But somewhat confusing result is the kernel time which is just around 1.3 ms. This is almost two times faster than the CUDA kernel time - it took 2.1 ms at best with texture access version.

I don't expect that NVidia spent more time optimizing OpenCL compiler than CUDA compiler. Or is Apple such a big customer which can't be disappointed ?


No comments: