Here is OpenCL version of C code and PTX assembly code. Those comments are added to follow.
__kernel void Pitch0( __global uchar *d_src, __global uchar *d_dst, int width, int height, int roiLeft, int roiTop, int roiRight, int roiBottom, int integerPitch, float toCeiling, float toFloor ) { int x = roiLeft + get_global_id(0); int y = roiTop + get_global_id(1); int idx = x + y * width; bool isInBound = x <= roiRight && y <= roiBottom; float east = (float)d_src[ idx - integerPitch - 1 ] * toCeiling + (float)d_src[ idx - integerPitch ] * toFloor; float west = (float)d_src[ idx + integerPitch ] * toFloor + (float)d_src[ idx + integerPitch + 1 ] * toCeiling; float diff = ( (float)d_src[idx] * 2.0f - ( east + west ) ) / 2.0f; diff = diff >= 0 ? diff : -diff; if( isInBound ) { d_dst[ idx ] = (uchar)( diff ); } } .entry Pitch0( .param .u32 .ptr .global .align 1 Pitch0_param_0, .param .u32 .ptr .global .align 1 Pitch0_param_1, .param .u32 Pitch0_param_2, .param .u32 Pitch0_param_3, .param .u32 Pitch0_param_4, .param .u32 Pitch0_param_5, .param .u32 Pitch0_param_6, .param .u32 Pitch0_param_7, .param .u32 Pitch0_param_8, .param .f32 Pitch0_param_9, .param .f32 Pitch0_param_10 ) { .reg .f32 %f< 18 >; .reg .pred %p< 5 >; .reg .s32 %r< 32 >; .reg .s16 %rc< 7 >; ld.param.u32 %r11, [Pitch0_param_0]; // d_src ld.param.u32 %r12, [Pitch0_param_2]; // width ld.param.u32 %r13, [Pitch0_param_4]; // roiLeft ld.param.u32 %r14, [Pitch0_param_5]; // roiTop ld.param.u32 %r15, [Pitch0_param_6]; // roiRight ld.param.u32 %r16, [Pitch0_param_7]; // roiBottom ld.param.u32 %r17, [Pitch0_param_8]; // integerPitch ld.param.f32 %f2, [Pitch0_param_9]; // toCeiling ld.param.f32 %f3, [Pitch0_param_10]; // toFloor mov.u32 %r3, %envreg3; mov.u32 %r4, %ntid.x; mov.u32 %r5, %ctaid.x; mov.u32 %r6, %tid.x; add.s32 %r18, %r3, %r13; mad.lo.s32 %r19, %r5, %r4, %r18; // r19 = ctaid.x * ntid.x + roiLeft add.s32 %r20, %r19, %r6; // r20 = x = roiLeft + get_global_id(0) = tid.x + ctaid.x * ntid.x + roiLeft mov.u32 %r7, %envreg4; mov.u32 %r8, %ntid.y; mov.u32 %r9, %ctaid.y; mov.u32 %r10, %tid.y; add.s32 %r21, %r7, %r14; mad.lo.s32 %r22, %r9, %r8, %r21; // r22 = ctaid.x * ntid.y + roiTop add.s32 %r23, %r22, %r10; // r23 = y = roiTop + get_global_id(1) = tid.y + ctaid.y * ntid.y + roiTop mad.lo.s32 %r2, %r23, %r12, %r20; // r2 = idx = width * y + x setp.le.s32 %p1, %r23, %r16; // p1 = y <= roiBottom setp.le.s32 %p2, %r20, %r15; // p2 = x <= roiRight and.pred %p3, %p1, %p2; // p3 = isInBound = p1 & p2 not.b32 %r24, %r17; add.s32 %r25, %r2, %r24; add.s32 %r26, %r11, %r25; ld.global.u8 %rc1, [%r26]; cvt.rn.f32.u8 %f4, %rc1; // f4 = float( d_src[ idx - integerPitch - 1] ) ld.global.u8 %rc2, [%r26+1]; cvt.rn.f32.u8 %f5, %rc2; // f5 = float( d_src[ idx - integerPitch] ) mul.f32 %f6, %f5, %f3; // f6 = f5 * toFloor fma.rn.f32 %f7, %f4, %f2, %f6; // f7 = east = f4 * toCeiling + f5 * toFloor add.s32 %r27, %r2, %r17; add.s32 %r28, %r11, %r27; ld.global.u8 %rc3, [%r28]; cvt.rn.f32.u8 %f8, %rc3; // f8 = float( d_src[ idx + integerPitch ] ) ld.global.u8 %rc4, [%r28+1]; cvt.rn.f32.u8 %f9, %rc4; // f9 = float( d_src[ idx + integerPitch + 1 ] ) mul.f32 %f10, %f9, %f2; // f10 = f9 * toCeiling fma.rn.f32 %f11, %f8, %f3, %f10; // f11 = west = f8 * toFloor + f10 * toCeiling add.s32 %r29, %r11, %r2; ld.global.u8 %rc5, [%r29]; cvt.rn.f32.u8 %f12, %rc5; // f12 = d_src[ idx ] add.f32 %f13, %f12, %f12; add.f32 %f14, %f7, %f11; sub.f32 %f15, %f13, %f14; // f15 = d_src[ idx ]*2 - (east+west) div.full.f32 %f16, %f15, 0f40000000; // f16 = diff = (d_src[ idx ]*2 - (east+west)) / 2.0 setp.ltu.f32 %p4, %f16, 0f00000000; // p4 = diff > 0 neg.f32 %f17, %f16; selp.f32 %f1, %f17, %f16, %p4; // f1 = diff = diff > 0 ? diff : diff @%p3 bra BB1_2; // if isInBound, jump BB1_2 ret; BB1_2: cvt.rzi.u16.f32 %rc6, %f1 // rc6 = ushort( diff ) ld.param.u32 %r31, [Pitch0_param_1]; add.s32 %r30, %r31, %r2; // r30 = d_dst + idx st.global.u8 [%r30], %rc6; // d_dst[idx] = uchar( ushort(diff) ) ret; }
Now CUDA version of PTX code is similar. One noticeable difference is more 'ld.global' instructions due to structures used for kernel function arguments. Here is snippet.
__global__ void KernelPitch0( unsigned char* d_src, unsigned char* d_dst, const Area* imgSize, const Region* roi, const PitchContext* pc ) { ... int idx = x + y * imgSize->Width; bool isInBound = x < = roi->Right && y < = roi->Bottom; if( isInBound ) { float east = (float)d_src[ idx - pc->IntegerPitch - 1 ] * pc - >ToCeiling + (float)d_src[ idx - pc->IntegerPitch ] * pc - >ToFloor; ... } .visible .entry _Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext( .param .u32 _Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_0, .param .u32 _Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_1, .param .u32 _Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_2, .param .u32 _Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_3, .param .u32 _Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_4 ) { ... ld.param.u32 %r9, [_Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_2]; // imgSize ld.param.u32 %r10, [_Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_3]; // roi ld.param.u32 %r11, [_Z12KernelPitch0PhS_PKN4SSKL10Inspection4AreaEPKNS1_6RegionEPK12PitchContext_param_4]; // pc ... ld.global.u32 %r16, [%r4]; // r16 = roi.Left add.s32 %r17, %r15, %r16 // r17 = roi.Left + tid.x ... ld.global.u32 %r22, [%r4+4]; // r22 = roi.Top add.s32 %r23, %r21, %r22; mad.lo.s32 %r5, %r19, %r20, %r23; // r5 = y = ntid.y * ctaid.y + roi.Top + tid.y ... ld.global.u32 %r24, [%r12]; // r24 = imgSize.x mad.lo.s32 %r6, %r5, %r24, %r18; // r6 = idx = imgSize.x * y + x ... ld.global.u32 %r25, [%r4+8]; // r25 = roi.Right setp.gt.s32 %p1, %r18, %r25; // p1 = x > roi.Right @%p1 bra BB0_3; ...
With this difference in mind, I have replaced all the structure based argument with value type like OpenCL C. Then got 1GB/s. Before it was meager 428MB/s.
Entering test case "TestProcess2" Total time taken : 3955.87 micro sec Size of image : 3.98 MB Processing time : 1005.97 MB/s Leaving test case "TestProcess2"; testing time: 540ms
Using structure on the kernel function arguments makes the structure resides on the device global memory and it inevitably ask device to copy the structure member to the thread register. And it consumes up the valuable memory traffic.
Now without structure, the function just needs number of ld.param.u32. ".param" state space is said to be defined per-grid. Then the physical location is likely to be global memory - NVidia says it is implementation specific and says some use global memory. If it comes from the global memory, then using structure as argument shouldn't be two times slower than value type argument. I mean in either way, it has to be loaded from the global memory.
Other possible explanation may come from the the locality of ld.param.u32. This instruction is being used at the beginning of function to load all the argument values regardless usage in the code. I guess those argument may stay in close proximity which can be good for cache.
Another thing to note is the kernel time of 1.7ms. WIth OpenCL, I got 1.2ms. Still there is .6ms slowness in the CUDA.
shared memory version Total time taken : 4406.42 micro sec Size of image : 3.98 MB Processing time : 903.11 MB/s texture memory version Total time taken : 4372.73 micro sec Size of image : 3.98 MB Processing time : 910.07 MB/s
Shared memory version and texture memory version are tried as above result but these are not as fast as plain version. Another thing to figure out.
No comments:
Post a Comment