As the memory does not have a common layout, it should be created with dedicated method - cuaMallocArray and is read-only and only accessible using special texture fetch functions. Well in 5.0 CUDA, they say it can be written using Surface API but that's not discussed in here
The CUDA programming guide tells that there are two API for texture. One is Texture Reference API and the other is Texture Object API. The Texture Reference API is used in here. It seems that static texture variable is the only short-fall compared to Texture Object API. And Object API is introduced at 5.0 which means there is not many help if stuck.
// texture<> is only allowed as file scope static variable // cudaReadModeElementType can't be used with cudaFilterModeLinear with uint8_t texel texture< uint8_t, cudaTextureType2D, cudaReadModeNormalizedFloat > gPitch2Texture; __global__ void KernelPitch2() { ... // add 0.5f to all texture coordinates for center offset float east, west, center; east = tex2D( gPitch2Texture, (float)( x - pc->Pitch + 0.5f), (float)(y+0.5f) ) * 255; west = tex2D( gPitch2Texture, (float)( x + pc->Pitch + 0.5f), (float)(y+0.5f) ) * 255; center = tex2D( gPitch2Texture, (float)(x+0.5f), (float)(y+0.5f) ) * 255; float diff = ( center * 2.0f - ( east + west ) ) / 2.0f; ... } void CUDA_Pitch2( ... ) { ... cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc< uint8_t >(); // texture memory should be created using cudaArray and binded with texture handle struct cudaArray *devSrcArray; cudaMallocArray( &devSrcArray, &channelDesc, width, height ); cudaMemcpyToArray( devSrcArray, 0, 0, h_src, imgSize, cudaMemcpyHostToDevice ); gPitch2Texture.addressMode[0] = cudaAddressModeClamp; gPitch2Texture.addressMode[1] = cudaAddressModeClamp; gPitch2Texture.filterMode = cudaFilterModeLinear; gPitch2Texture.normalized = false; cudaBindTextureToArray( gPitch2Texture, devSrcArray, channelDesc ); const int THREAD_WIDTH = 64, THREAD_HEIGHT = 4; dim3 blocks( THREAD_WIDTH, THREAD_HEIGHT ); dim3 grids( (int)ceil( (double)(roiRight-roiLeft+1) / (double)blocks.x ), (int)ceil( (double)(roiBottom-roiTop+1) / (double)blocks.y ) ); ... KernelPitch2<<< grids, blocks >>>( ... ) ... cudaFreeArray( devSrcArray ); }
No comments:
Post a Comment