essay on programming languages, computer science, information technologies and all.

Tuesday, March 12, 2013

CUDA - PTX code analysis

Remember that the OpenCL version is much faster than the CUDA version. With almost identical code, I was puzzled to see CUDA run two times slower, and I thought a comparison of the PTX assembly could enlighten me.

Here is the OpenCL C code and its PTX assembly. The comments in the assembly were added by me to make it easier to follow.

__kernel
void Pitch0(
  __global uchar *d_src, 
  __global uchar *d_dst,
  int width, int height,
  int roiLeft, int roiTop, int roiRight, int roiBottom,
  int integerPitch, float toCeiling, float toFloor )
{
  int x = roiLeft + get_global_id(0);
  int y = roiTop + get_global_id(1);
  int idx = x + y * width;

  bool isInBound = x <= roiRight && y <= roiBottom;

  float east = 
    (float)d_src[ idx - integerPitch - 1 ] * toCeiling +
    (float)d_src[ idx - integerPitch ] * toFloor;
  float west = 
    (float)d_src[ idx + integerPitch ] * toFloor + 
    (float)d_src[ idx + integerPitch + 1 ] * toCeiling;

  float diff = ( (float)d_src[idx] * 2.0f  - ( east + west ) ) / 2.0f;
  diff = diff >= 0 ? diff : -diff;

  if( isInBound )
  {
    d_dst[ idx ] = (uchar)( diff  );
  }
}

.entry Pitch0(
  .param .u32 .ptr .global .align 1 Pitch0_param_0,
  .param .u32 .ptr .global .align 1 Pitch0_param_1,
  .param .u32 Pitch0_param_2,
  .param .u32 Pitch0_param_3,
  .param .u32 Pitch0_param_4,
  .param .u32 Pitch0_param_5,
  .param .u32 Pitch0_param_6,
  .param .u32 Pitch0_param_7,
  .param .u32 Pitch0_param_8,
  .param .f32 Pitch0_param_9,
  .param .f32 Pitch0_param_10
)
{
  .reg .f32   %f<18>;
  .reg .pred   %p<5>;
  .reg .s32   %r<32>;
  .reg .s16   %rc<7>;


  ld.param.u32   %r11, [Pitch0_param_0];  // d_src
  ld.param.u32   %r12, [Pitch0_param_2];  // width
  ld.param.u32   %r13, [Pitch0_param_4];  // roiLeft
  ld.param.u32   %r14, [Pitch0_param_5];  // roiTop
  ld.param.u32   %r15, [Pitch0_param_6];  // roiRight
  ld.param.u32   %r16, [Pitch0_param_7];  // roiBottom
  ld.param.u32   %r17, [Pitch0_param_8];  // integerPitch
  ld.param.f32   %f2, [Pitch0_param_9];   // toCeiling
  ld.param.f32   %f3, [Pitch0_param_10];  // toFloor

  mov.u32   %r3, %envreg3;
  mov.u32   %r4, %ntid.x;
  mov.u32   %r5, %ctaid.x;
  mov.u32   %r6, %tid.x;
  add.s32   %r18, %r3, %r13;           // r18 = envreg3 (global work offset, dim 0) + roiLeft
  mad.lo.s32   %r19, %r5, %r4, %r18;   // r19 = ctaid.x * ntid.x + roiLeft
  add.s32   %r20, %r19, %r6;           // r20 = x = roiLeft + get_global_id(0) = tid.x + ctaid.x * ntid.x + roiLeft 

  mov.u32   %r7, %envreg4;
  mov.u32   %r8, %ntid.y;
  mov.u32   %r9, %ctaid.y;
  mov.u32   %r10, %tid.y;
  add.s32   %r21, %r7, %r14;           // r21 = envreg4 (global work offset, dim 1) + roiTop
  mad.lo.s32   %r22, %r9, %r8, %r21;   // r22 = ctaid.y * ntid.y + roiTop
  add.s32   %r23, %r22, %r10;          // r23 = y = roiTop + get_global_id(1) = tid.y + ctaid.y * ntid.y + roiTop
  
  mad.lo.s32   %r2, %r23, %r12, %r20;  // r2 = idx = width * y + x
  
  setp.le.s32   %p1, %r23, %r16;       // p1 = y <= roiBottom
  setp.le.s32   %p2, %r20, %r15;       // p2 = x <= roiRight
  and.pred    %p3, %p1, %p2;           // p3 = isInBound = p1 & p2
  
  not.b32   %r24, %r17;                // r24 = ~integerPitch = -integerPitch - 1
  add.s32   %r25, %r2, %r24;           // r25 = idx - integerPitch - 1
  add.s32   %r26, %r11, %r25;          // r26 = &d_src[ idx - integerPitch - 1 ]
  ld.global.u8   %rc1, [%r26];          
  cvt.rn.f32.u8   %f4, %rc1;           // f4 = float( d_src[ idx - integerPitch - 1] )
  ld.global.u8   %rc2, [%r26+1];       
  cvt.rn.f32.u8   %f5, %rc2;           // f5 = float( d_src[ idx - integerPitch] )
  mul.f32   %f6, %f5, %f3;             // f6 = f5 * toFloor
  fma.rn.f32   %f7, %f4, %f2, %f6;     // f7 = east = f4 * toCeiling + f5 * toFloor
  
  add.s32   %r27, %r2, %r17;           // r27 = idx + integerPitch
  add.s32   %r28, %r11, %r27;          // r28 = &d_src[ idx + integerPitch ]
  ld.global.u8   %rc3, [%r28];    
  cvt.rn.f32.u8   %f8, %rc3;           // f8 = float( d_src[ idx + integerPitch ] )
  ld.global.u8   %rc4, [%r28+1];    
  cvt.rn.f32.u8   %f9, %rc4;           // f9 = float( d_src[ idx + integerPitch + 1 ] )
  mul.f32   %f10, %f9, %f2;            // f10 = f9 * toCeiling
  fma.rn.f32   %f11, %f8, %f3, %f10;   // f11 = west = f8 * toFloor + f9 * toCeiling
  
  add.s32   %r29, %r11, %r2;           // r29 = &d_src[ idx ]
  ld.global.u8   %rc5, [%r29];    
  cvt.rn.f32.u8   %f12, %rc5;          // f12 = d_src[ idx ]
  add.f32   %f13, %f12, %f12;          // f13 = d_src[ idx ] * 2
  add.f32   %f14, %f7, %f11;           // f14 = east + west
  sub.f32   %f15, %f13, %f14;          // f15 = d_src[ idx ]*2 - (east+west)
  div.full.f32   %f16, %f15, 0f40000000; // f16 = diff = (d_src[ idx ]*2 - (east+west)) / 2.0
  setp.ltu.f32   %p4, %f16, 0f00000000;  // p4 = diff < 0
  neg.f32   %f17, %f16;                // f17 = -diff
  selp.f32   %f1, %f17, %f16, %p4;     // f1 = diff = diff < 0 ? -diff : diff
  @%p3 bra   BB1_2;                    // if isInBound, jump BB1_2

  ret;

BB1_2:
  cvt.rzi.u16.f32   %rc6, %f1;         // rc6 = ushort( diff )
  ld.param.u32   %r31, [Pitch0_param_1];  
  add.s32   %r30, %r31, %r2;           // r30 = d_dst + idx
  st.global.u8   [%r30], %rc6;         // d_dst[idx] = uchar( ushort(diff) )
  ret;
}



Now, the CUDA version of the PTX code is similar. One noticeable difference is more 'ld.global' instructions, due to the structures used as kernel function arguments. Here is a snippet.

__global__ 
void KernelPitch0( 
  unsigned char* d_src, unsigned char* d_dst, 
  const Area* imgSize, const Region* roi, const PitchContext* pc )
{
  ...
  int idx = x + y * imgSize->Width;
  bool isInBound = x <= roi->Right && y <= roi->Bottom;
  if( isInBound )
  {
    float east = 
      (float)d_src[ idx - pc->IntegerPitch - 1 ] * pc->ToCeiling +
      (float)d_src[ idx - pc->IntegerPitch ] * pc->ToFloor;
    ...
}

.visible .entry _Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext(
  .param .u32 _Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_0,
  .param .u32 _Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_1,
  .param .u32 _Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_2,
  .param .u32 _Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_3,
  .param .u32 _Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_4
)
{
  ...
  ld.param.u32   %r9, [_Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_2];  // imgSize
  ld.param.u32   %r10, [_Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_3]; // roi
  ld.param.u32   %r11, [_Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_4]; // pc

  ...
  ld.global.u32   %r16, [%r4];   // r16 = roi.Left
  add.s32   %r17, %r15, %r16;   // r17 = roi.Left + tid.x

  ... 
  ld.global.u32   %r22, [%r4+4];  // r22 = roi.Top
  add.s32   %r23, %r21, %r22;
  mad.lo.s32   %r5, %r19, %r20, %r23;         // r5 = y = ntid.y * ctaid.y + roi.Top + tid.y

  ...
  ld.global.u32   %r24, [%r12];   // r24 = imgSize.Width
  mad.lo.s32   %r6, %r5, %r24, %r18;         // r6 = idx = imgSize.Width * y + x 

  ...
  ld.global.u32   %r25, [%r4+8];  // r25 = roi.Right
  setp.gt.s32   %p1, %r18, %r25;  // p1 = x > roi.Right
  @%p1 bra   BB0_3;
  ...



With this difference in mind, I replaced all the structure-based arguments with value types, just like in the OpenCL C version, and got about 1 GB/s. Before, it was a meager 428 MB/s. (A sketch of the flattened kernel follows the test output below.)
Entering test case "TestProcess2"
Total time taken : 3955.87 micro sec 
Size of image : 3.98 MB 
Processing time : 1005.97 MB/s 
Leaving test case "TestProcess2"; testing time: 540ms
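
For reference, here is a minimal sketch of what the flattened kernel looks like after the change. It assumes the same math as the OpenCL kernel above; the kernel name and exact parameter order are my reconstruction, not the original code.

// Sketch of the CUDA kernel with value-type arguments instead of struct
// pointers; mirrors the OpenCL kernel above. Names are illustrative.
__global__
void KernelPitch0Flat(
  unsigned char* d_src, unsigned char* d_dst,
  int width, int height,
  int roiLeft, int roiTop, int roiRight, int roiBottom,
  int integerPitch, float toCeiling, float toFloor )
{
  int x = roiLeft + blockIdx.x * blockDim.x + threadIdx.x;
  int y = roiTop  + blockIdx.y * blockDim.y + threadIdx.y;
  int idx = x + y * width;

  bool isInBound = x <= roiRight && y <= roiBottom;

  float east =
    (float)d_src[ idx - integerPitch - 1 ] * toCeiling +
    (float)d_src[ idx - integerPitch ] * toFloor;
  float west =
    (float)d_src[ idx + integerPitch ] * toFloor +
    (float)d_src[ idx + integerPitch + 1 ] * toCeiling;

  float diff = fabsf( ( (float)d_src[ idx ] * 2.0f - ( east + west ) ) / 2.0f );

  if( isInBound )
  {
    d_dst[ idx ] = (unsigned char)diff;
  }
}

With value arguments like these, everything arrives through the .param state space, so the per-thread ld.global loads for imgSize, roi, and pc should disappear from the PTX.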


Using structures for the kernel function arguments makes the structures reside in device global memory, which inevitably forces the device to copy the structure members into thread registers. That eats up valuable memory traffic.

Now, without the structures, the function just needs a number of ld.param.u32 instructions. The ".param" state space is said to be defined per grid, so its physical location is likely global memory - NVIDIA says it is implementation specific and that some implementations use global memory. But if it comes from global memory, then using structures as arguments shouldn't be two times slower than value-type arguments; either way, the data has to be loaded from global memory.

Another possible explanation may be the locality of ld.param.u32. These instructions appear at the beginning of the function and load all the argument values, regardless of whether they are used later in the code. I guess those arguments stay in close proximity, which can be good for the cache.

Another thing to note is the kernel time of 1.7 ms. With OpenCL, I got 1.2 ms, so the CUDA version is still about 0.5 ms slower.

shared memory version
Total time taken : 4406.42 micro sec 
Size of image : 3.98 MB 
Processing time : 903.11 MB/s 

texture memory version
Total time taken : 4372.73 micro sec 
Size of image : 3.98 MB 
Processing time : 910.07 MB/s 


As the results above show, I also tried shared-memory and texture-memory versions, but neither is as fast as the plain version. That is another thing to figure out.
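
For completeness, here is a rough sketch of the shape the texture-memory variant could take with the texture reference API that was current at the time. This is my guess at the structure, not the original code, and the binding details are illustrative.

// Hypothetical texture-memory variant: read d_src through a 1D texture
// reference instead of plain global loads. Names are illustrative.
texture<unsigned char, cudaTextureType1D, cudaReadModeElementType> texSrc;

__global__
void KernelPitch0Tex(
  unsigned char* d_dst,
  int width, int height,
  int roiLeft, int roiTop, int roiRight, int roiBottom,
  int integerPitch, float toCeiling, float toFloor )
{
  int x = roiLeft + blockIdx.x * blockDim.x + threadIdx.x;
  int y = roiTop  + blockIdx.y * blockDim.y + threadIdx.y;
  int idx = x + y * width;

  float east =
    (float)tex1Dfetch( texSrc, idx - integerPitch - 1 ) * toCeiling +
    (float)tex1Dfetch( texSrc, idx - integerPitch ) * toFloor;
  float west =
    (float)tex1Dfetch( texSrc, idx + integerPitch ) * toFloor +
    (float)tex1Dfetch( texSrc, idx + integerPitch + 1 ) * toCeiling;

  float diff = fabsf( ( (float)tex1Dfetch( texSrc, idx ) * 2.0f - ( east + west ) ) / 2.0f );

  if( x <= roiRight && y <= roiBottom )
  {
    d_dst[ idx ] = (unsigned char)diff;
  }
}

// Host side, before the launch:
//   cudaBindTexture( 0, texSrc, d_src, width * height );

Since the neighboring reads here are already fairly well coalesced across threads, the texture cache may simply have little to add, which could be part of why this variant does not beat the plain version.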

