To pack 8 binary pixels to a byte, it is needed to read result from the neighbor's binary pixel value. This neighbor's pixel can be written to a global memory and read back but that will be expensive write and read operations. Instead, we can write binary result to a local memory and read 8 pixels back and pack to a byte and write to the global memory. Another thing to be careful is the bit order - which bit to turn on for the pixel index 0. Is it the most significant bit or least significant bit ? In here, I use the most significant bit for the pixel index 0 and decrease the bit location for increased pixel index. In this way, it is easy to read byte value and guess the location of the pixel. Below is the kernel code
float PitchedSubPixel( __global uchar* src, int idx, float weightLeft, float weightRight )
{
return (float)src[ idx ] * weightLeft + (float)src[ idx+1 ] * weightRight;
}
float PixelDifference( float center, float east, float west )
{
float diff = ( center * 2.0f - ( east + west ) ) / 2.0f;
return diff >= 0 ? diff : -diff;
}
__kernel
void PitchThreshold(
__global uchar *d_src,
__global uchar *d_dst,
int width, int height,
int roiLeft, int roiTop, int roiRight, int roiBottom,
int integerPitch, float toCeiling, float toFloor,
__local uchar *localDst, // this is the local memory where temporary binary pixel is stored
uchar threshold )
{
int x = (roiLeft & (~0x07)) + get_global_id(0); // x always starts with multiple of 8 - byte aligned
int y = roiTop + get_global_id(1);
int idx = x + y * width;
bool isInBound = x >= roiLeft && x <= roiRight && y <= roiBottom;
float east = 0.0f, west = 0.0f, diff = 0.0f;
if( isInBound )
{
east = PitchedSubPixel( d_src, idx-integerPitch-1, toCeiling, toFloor );
west = PitchedSubPixel( d_src, idx+integerPitch, toFloor, toCeiling );
diff = PixelDifference( (float)d_src[idx], east, west );
}
uchar binary = ((uchar)( diff ) < threshold) ? 0 : 1; // fixed threshold, if outside of ROI, then 0
int localX = get_local_id(0); // localX is 0 to 63. Refer host code
int localIdx = get_local_id(1) * get_local_size(0) + localX; // get_local_id(1) is 0 to 3. get_local_size(0) is 64 as width. Refer host code
binary = binary << (7-(localX & 0x07)); // i.e, if localX is 10, then it is 1 << 5,
localDst[localIdx] = binary; // putting the shifted binary value to local memory
isInBound = x <= roiRight && y <= roiBottom; // ROI start at byte aligned
barrier( CLK_LOCAL_MEM_FENCE ); // wait for the neighbor's local memory is written
if( isInBound && (localX & 0x07) == 0 )
{
uchar packed = // read 8 pixels and pack as a byte
localDst[ localIdx ] | localDst[ localIdx+1 ] | localDst[ localIdx+2 ] | localDst[ localIdx+3 ] |
localDst[ localIdx+4 ] | localDst[ localIdx+5 ] | localDst[ localIdx+6 ] | localDst[ localIdx+7 ] ;
d_dst[ idx >> 3 ] = packed; // write to global memory
}
}
Below is the host code snippet
const int LOCAL_WIDTH = 64, LOCAL_HEIGHT = 4;
cl::NDRange localws( LOCAL_WIDTH, LOCAL_HEIGHT );
cl::NDRange globalws(
LOCAL_WIDTH * (int)ceil( (double)(roiRight-roiLeft+1) / (double)LOCAL_WIDTH ),
LOCAL_HEIGHT * (int)ceil( (double)(roiBottom-roiTop+1) / (double)LOCAL_HEIGHT )
);
...
kernelPitch0.setArg( 11, LOCAL_WIDTH*LOCAL_HEIGHT*sizeof(uint8_t), NULL ); // request to allocate local memory
kernelPitch0.setArg( 12, threshold );
vector ve1; ve1.push_back( steps[1] );
queue.enqueueNDRangeKernel( kernelPitch0, cl::NullRange, globalws, localws, &ve1, &steps[2] );
Just pitch comparison was 1.4GB/s but now it is around 1.3GB/s. Refer below time profiling. Step 1 : start 0 ns, end 646304 ns, duration 646304 ns, 6144.53 MB/s Step 2 : start 1086688 ns, end 2980640 ns, duration 1893952 ns, 2096.80 MB/s <- was 3085 MB/s Step 3 : start 3009344 ns, end 3090080 ns, duration 80736 ns, 49187.88 MB/s <- was 6330 MB/s Total : duration 3090080 ns, 1285.16 MB/s <- was 1406 MB/s