光栅处理函数(二)
游戏开发中常用光栅处理函数(二)
光栅预乘Alpha处理函数,像素格式为BBGGRRAA(BB为低地址),各个版本都集齐了(MMX、SSE、SSE2),原先用于桌面透明窗口(UpdateLayeredWindow)。注意几点:
1. MMX版本一次处理2个pixel、SSE版本一次处理2个pixel(但是指令更简洁)、SSE2版本一次处理4个pixel,所以效率层面是 MMX < SSE < SSE2。
2. 处理行内剩余像素的时候,指令会取“当前行末剩余像素” + “下一行首个像素”,当处理"最后一行"的时候, 可能会导致“读取内存越界”。一般情况下都不太会有问题,后面4个字节,取出来但是不会用。 MMX和SSE版本都是一次性处理2个pixel,如果光栅宽度是奇数,就会取 “下一行首个像素”。
3. SSE2版本一次处理4个pixel,如果光栅宽度不是4的倍数,当前版本未做处理,不会导致“读取内存越界”, 但是效果上有缺陷。如果你要用SSE2版本,保证光栅宽度是4的倍数。
//< 32位光栅预乘Alpha,使用MMX指令处理
/************************************************************************/void BltSurface32ToDIB32_SelfMulAlphaMMX( void *pDst, void *pSrc, unsigned int width, unsigned int height, unsigned int src_pitch )
{int src_pitch_sub_dst_pitch; //src pointer对齐到下一行scanline,需要跳过多少字节__asm{//取参数,判断width和height是否有任一为0mov eax, height //eax = heighmov ebx, widthmul ebx //width * heighttest eax, eax //影响ZFjz end_pixel//常量赋值mov esi, pSrcmov edi, pDstpcmpeqd mm5, mm5 //mm5 = 0xffffffff_ffffffffpcmpeqd mm6, mm6 //mm6 = 0xffffffff_ffffffffpsrld mm5, 8 //mm5 = 0x00ffffff_00ffffffpsrlw mm6, 8 //mm6 = 0x00ff_00ff_00ff_00ffpxor mm7, mm7 //mm7 = 0x0//判断pitchmov edx, src_pitchshl ebx, 2 //每个像素4个字节, dst_pitch = width * 4sub edx, ebx //src_pitch - dst_pitchjnz diff_pitch//same_pitch:mov ecx, eaxmov edx, 1 //how many lines,eax和edx构成2层循环and ecx, 1 //一行上剩下多少个不成对的象素,same_pitch时就是(width*height & 1),diff_pitch时就是(width & 1)shr eax, 1 //一行上主循环多少次,same_pitch时就是(width*height >> 1)diff_pitchh时就是(width >> 1)jmp test_pair_pixeldiff_pitch:mov src_pitch_sub_dst_pitch, edx //src_pitch - dst_pitchmov eax, widthmov edx, height //how many lines,eax和edx构成2层循环mov ecx, eaxand ecx, 1 //一行上剩下多少个不成对的象素,same_pitch时就是(width*height & 1),diff_pitch时就是(width & 1)shr eax, 1 //一行上主循环多少次,same_pitch时就是(width*height >> 1)diff_pitchh时就是(width >> 1)mov ebx, eax //main loop count on every scanlinejmp test_pair_pixelloop_line:mov eax, ebxloop_pair_pixel:movq mm0, [esi] //mm0 = 0xaarrggbb_AARRGGBBmovq mm4, mm5 //mm4 = mm5 = 0x00ffffff_00ffffffmovq mm1, mm0 //mm1 = mm0 = 0xaarrggbb_AARRGGBBmovq mm2, mm0 //mm2 = mm0 = 0xaarrggbb_AARRGGBBpandn mm4, mm0 //保存alpha, mm4 = 0xaa000000_AA000000movq mm3, mm0 //mm3 = mm0 = 0xaarrggbb_AARRGGBBpunpckhbw mm1, mm7 //扩展每个通道, mm1 = 0x00aa_00rr_00gg_00bbpunpcklbw mm0, mm7 //mm0 = 0x00AA_00RR_00GG_00BBpunpcklbw mm2, mm2 //构建2个象素的alpha, mm2 = 0xAAAA_RRRR_GGGG_BBBBpunpckhbw mm3, mm3 //mm3 = 0xaaaa_rrrr_gggg_bbbbpunpckhwd mm2, mm2 //mm2 = 0xAAAA_AAAA_RRRR_RRRRpunpckhwd mm3, mm3 //mm3 = 0xaaaa_aaaa_rrrr_rrrrpunpckhdq mm2, mm2 //mm2 = 0xAAAA_AAAA_AAAA_AAAApunpckhdq mm3, mm3 //mm3 = 0xaaaa_aaaa_aaaa_aaaapand mm2, mm6 //mm2 = 0x00AA_00AA_00AA_00AApand mm3, mm6 //mm3 = 0x00aa_00aa_00aa_00aapmullw mm0, mm2 
//自乘alpha,字组相乘,取低16位pmullw mm1, mm3psrlw mm0, 8 //除以256psrlw mm1, 8packuswb mm0, mm0 //合并单个象素packuswb mm1, mm1punpckldq mm0, mm1 //将2个象素合并pand mm0, mm5 //恢复原始alphapor mm0, mm4//put_pixel:movq [edi], mm0add esi, 8add edi, 8dec eax
test_pair_pixel:jnz loop_pair_pixel//rest_line_pixel:jecxz next_line //scanline_rest_pixel不是0就是1movq mm0, [esi] //mm0 = 0xaarrggbb_AARRGGBBmovq mm4, mm5 //mm4 = mm5 = 0x00ffffff_00ffffffmovq mm1, mm0 //mm1 = mm0 = 0xaarrggbb_AARRGGBBmovq mm2, mm0 //mm2 = mm0 = 0xaarrggbb_AARRGGBBpandn mm4, mm0 //保存alpha, mm4 = 0xaa000000_AA000000movq mm3, mm0 //mm3 = mm0 = 0xaarrggbb_AARRGGBBpunpckhbw mm1, mm7 //扩展每个通道, mm1 = 0x00aa_00rr_00gg_00bbpunpcklbw mm0, mm7 //mm0 = 0x00AA_00RR_00GG_00BBpunpcklbw mm2, mm2 //构建2个象素的alpha, mm2 = 0xAAAA_RRRR_GGGG_BBBBpunpckhbw mm3, mm3 //mm3 = 0xaaaa_rrrr_gggg_bbbbpunpckhwd mm2, mm2 //mm2 = 0xAAAA_AAAA_RRRR_RRRRpunpckhwd mm3, mm3 //mm3 = 0xaaaa_aaaa_rrrr_rrrrpunpckhdq mm2, mm2 //mm2 = 0xAAAA_AAAA_AAAA_AAAApunpckhdq mm3, mm3 //mm3 = 0xaaaa_aaaa_aaaa_aaaapand mm2, mm6 //mm2 = 0x00AA_00AA_00AA_00AApand mm3, mm6 //mm3 = 0x00aa_00aa_00aa_00aapmullw mm0, mm2 //自乘alpha,字组相乘,取低16位pmullw mm1, mm3psrlw mm0, 8 //除以256psrlw mm1, 8packuswb mm0, mm0 //合并单个象素packuswb mm1, mm1punpckldq mm0, mm1 //将2个象素合并pand mm0, mm5 //恢复原始alphapor mm0, mm4movd [edi], mm0add esi, 4add edi, 4next_line:add esi, src_pitch_sub_dst_pitch //设置指针到下一个src行dec edxjnz loop_lineemms //清除mmx指令状态end_pixel:}
}//< 32位光栅预乘Alpha,使用SSE指令处理
/************************************************************************/void BltSurface32ToDIB32_SelfMulAlphaSSE( void *pDst, void *pSrc, unsigned int width, unsigned int height, unsigned int src_pitch )
{int src_pitch_sub_dst_pitch;//src pointer对齐到下一行scanline,需要跳过多少字节__asm{//取参数,判断width和height是否有任一为0mov eax, height //eax = heighmov ebx, widthmul ebx //width * heighttest eax, eax //影响ZFjz end_pixel//常量赋值mov esi, pSrcmov edi, pDstpcmpeqd mm5, mm5 //mm5 = 0xffffffff_ffffffffpxor mm7, mm7 //mm7 = 0x0psrld mm5, 8 //mm5 = 0x00ffffff_00ffffff//判断pitchmov edx, src_pitchshl ebx, 2 //每个像素4个字节, dst_pitch = width * 4sub edx, ebx //src_pitch - dst_pitchjnz diff_pitch//same_pitch:mov ecx, eaxmov edx, 1 //how many lines,eax和edx构成2层循环and ecx, 1 //一行上剩下多少个不成对的象素,same_pitch时就是(width*height & 1),diff_pitch时就是(width & 1)shr eax, 1 //一行上主循环多少次,same_pitch时就是(width*height >> 1)diff_pitchh时就是(width >> 1)jmp test_pair_pixeldiff_pitch:mov src_pitch_sub_dst_pitch, edx //src_pitch - dst_pitchmov eax, widthmov edx, height //how many lines,eax和edx构成2层循环mov ecx, eaxand ecx, 1 //一行上剩下多少个不成对的象素,same_pitch时就是(width*height & 1),diff_pitch时就是(width & 1)shr eax, 1 //一行上主循环多少次,same_pitch时就是(width*height >> 1)diff_pitchh时就是(width >> 1)mov ebx, eax //main loop count on every scanlinejmp test_pair_pixelloop_line:mov eax, ebxloop_pair_pixel:movq mm0, [esi] //mm0 = 0xaarrggbb_AARRGGBBmovq mm4, mm5 //mm4 = mm5 = 0x00ffffff_00ffffffmovq mm1, mm0 //mm1 = mm0 = 0xaarrggbb_AARRGGBBpandn mm4, mm0 //保存alpha, mm4 = 0xaa000000_AA000000punpcklbw mm0, mm7 //mm0 = 0x00AA_00RR_00GG_00BBpunpckhbw mm1, mm7 //mm1 = 0x00aa_00rr_00gg_00bbpshufw mm2, mm0, 0xff //mm2 = 0x00AA_00AA_00AA_00AApshufw mm3, mm1, 0xff //mm3 = 0x00aa_00aa_00aa_00aapmullw mm0, mm2 //自乘alpha,字组相乘,取低16位pmullw mm1, mm3psrlw mm0, 8 //除以256psrlw mm1, 8packuswb mm0, mm0 //合并单个象素packuswb mm1, mm1punpckldq mm0, mm1 //将2个象素合并pand mm0, mm5 //恢复原始alphapor mm0, mm4//put_pixel:MOVNTQ [edi], mm0add esi, 8add edi, 8dec eax
test_pair_pixel:jnz loop_pair_pixel//rest_line_pixel:jecxz next_line //scanline_rest_pixel不是0就是1movq mm0, [esi] //mm0 = 0xaarrggbb_AARRGGBBmovq mm4, mm5 //mm4 = mm5 = 0x00ffffff_00ffffffmovq mm1, mm0 //mm1 = mm0 = 0xaarrggbb_AARRGGBBpandn mm4, mm0 //保存alpha, mm4 = 0xaa000000_AA000000punpcklbw mm0, mm7 //mm0 = 0x00AA_00RR_00GG_00BBpunpckhbw mm1, mm7 //mm1 = 0x00aa_00rr_00gg_00bbpshufw mm2, mm0, 0xff //mm2 = 0x00AA_00AA_00AA_00AApshufw mm3, mm1, 0xff //mm3 = 0x00aa_00aa_00aa_00aapmullw mm0, mm2 //自乘alpha,字组相乘,取低16位pmullw mm1, mm3psrlw mm0, 8 //除以256psrlw mm1, 8packuswb mm0, mm0 //合并单个象素packuswb mm1, mm1punpckldq mm0, mm1 //将2个象素合并pand mm0, mm5 //恢复原始alphapor mm0, mm4movd [edi], mm0add esi, 4add edi, 4next_line:add esi, src_pitch_sub_dst_pitch //设置指针到下一个src行dec edxjnz loop_lineemmsend_pixel:}
}//< 32位光栅预乘Alpha,使用SSE2指令处理
/************************************************************************/void BltSurface32ToDIB32_SelfMulAlphaSSE2( void *pDst, void *pSrc, unsigned int width, unsigned int height, unsigned int src_pitch )
{int src_pitch_sub_dst_pitch;//src pointer对齐到下一行scanline,需要跳过多少字节__asm{//取参数,判断width和height是否有任一为0mov eax, height //eax = heighmov ebx, widthmul ebx //width * heighttest eax, eax //影响ZFjz end_pixel//常量赋值mov esi, pSrcmov edi, pDstpcmpeqd xmm6, xmm6 //xmm6 = 0xffffffff_ffffffff_ffffffff_ffffffffxorps xmm7, xmm7 //xmm7 = 0x0psrld xmm6, 8 //xmm6 = 0x00ffffff_00ffffff_00ffffff_00ffffff//判断pitchmov edx, src_pitchshl ebx, 2 //每个像素4个字节, dst_pitch = width * 4cmp edx, ebx //src_pitch - dst_pitchjnz diff_pitch//same_pitch:mov ecx, eaxmov edx, 1 //how many lines,eax和edx构成2层循环shr eax, 2 //一行上主循环多少次,same_pitch时就是(width*height >> 2)diff_pitchh时就是(width >> 2)jmp test_quat_pixeldiff_pitch:and ebx, ~0x0fsub edx, ebx //src_pitch - dst_pitchmov src_pitch_sub_dst_pitch, edx //src_pitch - dst_pitchmov eax, widthmov edx, height //how many lines,eax和edx构成2层循环test eax, 3 //这里比较特殊,diff_pitch情况下,像素的个数必须是4的倍数,否则会导致Access Violation;因此如果遇到不是4的倍数,则退出jnz end_pixelshr eax, 2 //一行上主循环多少次,same_pitch时就是(width*height >> 2)diff_pitchh时就是(width >> 2)mov ebx, eax //main loop count on every scanlinejmp test_quat_pixelloop_line:mov eax, ebxloop_quat_pixel:MOVUPS xmm0, [esi] //xmm0 = 0xaarrggbb_AARRGGBB_wwxxyyzz_WWXXYYZZMOVAPS xmm5, xmm6 //xmm5 = xmm6 = 0x00ffffff_00ffffff_00ffffff_00ffffffMOVAPS xmm1, xmm0 //xmm1 = xmm0 = 0xaarrggbb_AARRGGBB_wwxxyyzz_WWXXYYZZandnps xmm5, xmm0 //xmm5 = 0xaa000000_AA000000_ww000000_WW000000, 保存alphapunpcklbw xmm0, xmm7 //xmm0 = 0x00ww_00xx_00yy_00zz_00WW_00XX_00YY_00ZZpunpckhbw xmm1, xmm7 //xmm1 = 0x00aa_00rr_00gg_00bb_00AA_00RR_00GG_00BBpshuflw xmm2, xmm0, 0xff //xmm2 = 0xww_00xx_00yy_00zz_00WW_00WW_00WW_00WWpshuflw xmm3, xmm1, 0xff //xmm3 = 0xaa_00rr_00gg_00bb_00AA_00AA_00AA_00AApshufhw xmm2, xmm2, 0xff //xmm2 = 0x00ww_00ww_00ww_00ww_00WW_00WW_00WW_00WWpshufhw xmm3, xmm3, 0xff //xmm3 = 0x00aa_00aa_00aa_00aa_00AA_00AA_00AA_00AApmullw xmm0, xmm2 //自乘alpha,字组相乘,取低16位pmullw xmm1, xmm3psrlw xmm0, 8 //除以256,取16位中的高位psrlw xmm1, 8packuswb xmm0, xmm0 //合并2个象素packuswb xmm1, 
xmm1PUNPCKLQDQ xmm0, xmm1 //将4个象素合并andps xmm0, xmm6 //恢复原始alphaorps xmm0, xmm5//put_pixel:MOVNTDQ [edi], xmm0add esi, 16add edi, 16dec eax
test_quat_pixel:jnz loop_quat_pixel//sse2模式下不处理剩余像素,非4的倍数;//next_line:add esi, src_pitch_sub_dst_pitch //设置指针到下一个src行dec edxjnz loop_lineend_pixel:}
}
更多推荐
游戏开发中常用光栅处理函数(二)
发布评论