/// Horizontal pitch filter over a rectangular region of an 8-bit image.
///
/// For every pixel in the ROI the kernel samples the row at two sub-pixel
/// positions, one roughly `horPitch` to the left and one roughly `horPitch`
/// to the right. Each sample is a weighted blend of two adjacent bytes, using
/// 8-bit fixed-point weights derived from the fractional part of `horPitch`.
/// Each blended sample is halved, and the output byte is
///     | (center << 8) - (left + right) | >> 8
/// where the absolute difference is built from two saturating subtractions.
///
/// @param h_src     Source image, `width` bytes per row.
/// @param h_dst     Destination image, same row stride as the source.
/// @param width     Row stride in bytes for both buffers.
/// @param height    Not used by this routine.
/// @param roiLeft   First column processed (inclusive).
/// @param roiTop    First row processed (inclusive).
/// @param roiRight  Last column at which a vector iteration may start (inclusive).
/// @param roiBottom Last row processed (inclusive).
/// @param horPitch  Horizontal pitch in pixels; the integer part selects the
///                  byte offset, the fractional part the blend weights.
/// @param verPitch  Not used by this routine.
///
/// NOTE(review): every iteration stores 8 bytes and the neighbour loads read
/// 16 bytes, so the last iteration of a row can touch memory past `roiRight`
/// (and before `roiLeft - integerPitch - 1`). The caller presumably provides
/// enough padding around the ROI -- confirm against the call sites.
void SSE_Pitch0( const uint8_t *h_src, uint8_t *h_dst, int width, int height, int roiLeft, int roiTop, int roiRight, int roiBottom, float horPitch, float verPitch )
{
    // Number of 16-bit lanes per vector; also the number of pixels handled per iteration.
    const int sizeOfXMMInWords = SSE::SizeOfDataType / 2;

    const int integerPitch = (int)horPitch;

    // Fractional part of the pitch as an 8-bit fixed-point weight, plus its complement.
    const uint16_t toFloor   = (uint16_t)( (horPitch - (int)horPitch) * 0xFF );
    const uint16_t toCeiling = 0xFF - toFloor;

    // Broadcast the weights to every 16-bit lane. The previous code built
    // std::vectors and then cast the *value* of front() to a pointer (missing
    // '&'), so the load read from a bogus address instead of the weights.
    const __m128i xmmFloor   = _mm_set1_epi16( (short)toFloor );
    const __m128i xmmCeiling = _mm_set1_epi16( (short)toCeiling );

    for ( int y = roiTop; y <= roiBottom; y++ )
    {
        int x = roiLeft;
        const uint8_t *source = h_src + width*y + x;
        uint8_t *target = h_dst + width*y + x;

        for ( ; x <= roiRight; x += sizeOfXMMInWords, source += sizeOfXMMInWords, target += sizeOfXMMInWords )
        {
            // _mm_cvtepu8_epi16 can't load value from memory directly unlike PMOVZXBW

            // Left sample: bytes at (source - integerPitch - 1) and the byte right after.
            __m128i raw   = _mm_loadu_si128( (const __m128i*)(source - integerPitch - 1) );
            __m128i first = _mm_cvtepu8_epi16( raw );
            __m128i next  = _mm_cvtepu8_epi16( _mm_srli_si128( raw, 1 ) );
            first = _mm_mullo_epi16( first, xmmFloor );
            next  = _mm_mullo_epi16( next, xmmCeiling );
            __m128i xmmLeft = _mm_srli_epi16( _mm_adds_epu16( first, next ), 1 );

            // Right sample: bytes at (source + integerPitch) and the byte right after,
            // with the weights swapped relative to the left sample.
            raw   = _mm_loadu_si128( (const __m128i*)(source + integerPitch) );
            first = _mm_cvtepu8_epi16( raw );
            next  = _mm_cvtepu8_epi16( _mm_srli_si128( raw, 1 ) );
            first = _mm_mullo_epi16( first, xmmCeiling );
            next  = _mm_mullo_epi16( next, xmmFloor );
            __m128i xmmRight = _mm_srli_epi16( _mm_adds_epu16( first, next ), 1 );

            // Centre pixels scaled to 8.8 fixed point. Only the low 8 bytes are
            // consumed, so an 8-byte load is sufficient.
            const __m128i centerScaled = _mm_slli_epi16(
                _mm_cvtepu8_epi16( _mm_loadl_epi64( (const __m128i*)(source) ) ), 8 );

            // |center - (left + right)|: one of the two saturating differences
            // is always zero, so their sum is the absolute difference.
            __m128i above = _mm_subs_epu16( centerScaled, xmmLeft );
            above = _mm_subs_epu16( above, xmmRight );
            __m128i below = _mm_adds_epu16( xmmLeft, xmmRight );
            below = _mm_subs_epu16( below, centerScaled );
            __m128i result = _mm_adds_epu16( above, below );

            // Back to 8-bit and store the 8 resulting pixels.
            result = _mm_srli_epi16( result, 8 );
            result = _mm_packus_epi16( result, _mm_setzero_si128() );
            _mm_storel_epi64( (__m128i*)(target), result );
        }
    }
}
Surprisingly, this achieves only 160 MB/s of throughput. The reason is obvious when we look at the disassembled code: the compiler doesn't utilize the 16 xmm registers. It behaves as if there were only one xmm register, constantly spilling the xmm register value to the stack and reloading it. This doesn't improve with the -msse2 option, or with any other optimization flag.
__m128i xmmTemp, xmmLeft, xmmRight, xmmEast, xmmWest, xmmCenter; xmmTemp = _mm_loadu_si128( (const __m128i*)(source-integerPitch-1) ); 00403616 mov edx,dword ptr [source] 0040361C sub edx,dword ptr [ebp-1Ch] 0040361F sub edx,1 00403622 movdqu xmm0,xmmword ptr [edx] 00403626 movdqa xmmword ptr [ebp-0D0h],xmm0 0040362E movdqa xmm0,xmmword ptr [ebp-0D0h] 00403636 movdqa xmmword ptr [xmmTemp],xmm0 xmmEast = _mm_cvtepu8_epi16( xmmTemp ); 0040363E pmovzxbw xmm0,mmword ptr [xmmTemp] 00403647 movdqa xmmword ptr [ebp-0F0h],xmm0 0040364F movdqa xmm0,xmmword ptr [ebp-0F0h] 00403657 movdqa xmmword ptr [xmmEast],xmm0 xmmTemp = _mm_srli_si128( xmmTemp, 1 ); 0040365F movdqa xmm0,xmmword ptr [xmmTemp] 00403667 psrldq xmm0,1 0040366C movdqa xmmword ptr [ebp-110h],xmm0 00403674 movdqa xmm0,xmmword ptr [ebp-110h] 0040367C movdqa xmmword ptr [xmmTemp],xmm0
The compiler used here is VC++ 2008 SP1. On the internet, people say that VC++ 2010 is much better than the older versions. But given that the Intel compiler and GCC allow inline assembler, is there still any merit in pursuing intrinsics? Would the compiler produce better code than hand-written assembly in the end? I guess the answer is "it depends". Still, the fact stands that the VC++ 2008 intrinsics support is incompetent at SSE.
No comments:
Post a Comment