游戏开发中常用光栅处理函数(二)

编程入门 行业动态 更新时间:2024-10-09 15:15:48

游戏开发中常用<a href="https://www.elefans.com/category/jswz/34/1742972.html">光栅处理函数(二)</a>

游戏开发中常用光栅处理函数(二)

光栅预乘Alpha处理函数,像素格式为BBGGRRAA(BB为低地址), 各个版本都集齐了(MMX、SSE、SSE2),原先用于桌面透明窗口(UpdateLayeredWindow)。
注意几点: 1.  MMX版本一次处理2个pixel、SSE版本一次处理2个pixel(但是指令更简洁)、SSE2版本一次处理4个pixel, 所以效率层面是 MMX < SSE < SSE2。
2.  处理行内剩余像素的时候,指令会取“当前行末剩余像素” + “下一行首个像素”,当处理"最后一行"的时候, 可能会导致“读取内存越界”。一般情况下都不太会有问题,后面4个字节,取出来但是不会用。 MMX和SSE版本都是一次性处理2个pixel,如果光栅宽度是奇数,就会取 “下一行首个像素”。

3.  SSE2版本一次处理4个pixel,如果光栅宽度不是4的倍数,当前版本未做处理,不会导致“读取内存越界”, 但是效果上有缺陷。如果你要用SSE2版本,保证光栅宽度是4的倍数。
//< 32位光栅预乘Alpha,使用MMX指令处理
/************************************************************************/void    BltSurface32ToDIB32_SelfMulAlphaMMX( void *pDst, void *pSrc, unsigned int width, unsigned int height, unsigned int src_pitch )
{int    src_pitch_sub_dst_pitch; //src pointer对齐到下一行scanline,需要跳过多少字节__asm{//取参数,判断width和height是否有任一为0mov        eax, height     //eax = heighmov        ebx, widthmul        ebx             //width * heighttest       eax, eax        //影响ZFjz         end_pixel//常量赋值mov        esi, pSrcmov        edi, pDstpcmpeqd    mm5, mm5    //mm5 = 0xffffffff_ffffffffpcmpeqd    mm6, mm6    //mm6 = 0xffffffff_ffffffffpsrld      mm5, 8      //mm5 = 0x00ffffff_00ffffffpsrlw      mm6, 8      //mm6 = 0x00ff_00ff_00ff_00ffpxor       mm7, mm7    //mm7 = 0x0//判断pitchmov        edx, src_pitchshl        ebx, 2      //每个像素4个字节, dst_pitch = width * 4sub        edx, ebx    //src_pitch - dst_pitchjnz        diff_pitch//same_pitch:mov        ecx, eaxmov        edx, 1        //how many lines,eax和edx构成2层循环and        ecx, 1        //一行上剩下多少个不成对的象素,same_pitch时就是(width*height & 1),diff_pitch时就是(width & 1)shr        eax, 1        //一行上主循环多少次,same_pitch时就是(width*height >> 1)diff_pitchh时就是(width >> 1)jmp        test_pair_pixeldiff_pitch:mov        src_pitch_sub_dst_pitch, edx    //src_pitch - dst_pitchmov        eax, widthmov        edx, height   //how many lines,eax和edx构成2层循环mov        ecx, eaxand        ecx, 1        //一行上剩下多少个不成对的象素,same_pitch时就是(width*height & 1),diff_pitch时就是(width & 1)shr        eax, 1        //一行上主循环多少次,same_pitch时就是(width*height >> 1)diff_pitchh时就是(width >> 1)mov        ebx, eax      //main loop count on every scanlinejmp        test_pair_pixelloop_line:mov        eax, ebxloop_pair_pixel:movq   mm0, [esi]      //mm0 = 0xaarrggbb_AARRGGBBmovq   mm4, mm5        //mm4 = mm5 = 0x00ffffff_00ffffffmovq   mm1, mm0        //mm1 = mm0 = 0xaarrggbb_AARRGGBBmovq   mm2, mm0        //mm2 = mm0 = 0xaarrggbb_AARRGGBBpandn  mm4, mm0        //保存alpha, mm4 = 0xaa000000_AA000000movq   mm3, mm0        //mm3 = mm0 = 0xaarrggbb_AARRGGBBpunpckhbw  mm1, mm7    //扩展每个通道, mm1 = 0x00aa_00rr_00gg_00bbpunpcklbw  mm0, mm7    //mm0 = 0x00AA_00RR_00GG_00BBpunpcklbw  mm2, mm2    //构建2个象素的alpha, mm2 = 
0xAAAA_RRRR_GGGG_BBBBpunpckhbw  mm3, mm3    //mm3 = 0xaaaa_rrrr_gggg_bbbbpunpckhwd  mm2, mm2    //mm2 = 0xAAAA_AAAA_RRRR_RRRRpunpckhwd  mm3, mm3    //mm3 = 0xaaaa_aaaa_rrrr_rrrrpunpckhdq  mm2, mm2    //mm2 = 0xAAAA_AAAA_AAAA_AAAApunpckhdq  mm3, mm3    //mm3 = 0xaaaa_aaaa_aaaa_aaaapand   mm2, mm6        //mm2 = 0x00AA_00AA_00AA_00AApand   mm3, mm6        //mm3 = 0x00aa_00aa_00aa_00aapmullw mm0, mm2        //自乘alpha,字组相乘,取低16位pmullw mm1, mm3psrlw  mm0, 8          //除以256psrlw  mm1, 8packuswb mm0, mm0      //合并单个象素packuswb mm1, mm1punpckldq mm0, mm1     //将2个象素合并pand   mm0, mm5        //恢复原始alphapor    mm0, mm4//put_pixel:movq [edi], mm0add esi, 8add edi, 8dec eax
test_pair_pixel:jnz    loop_pair_pixel//rest_line_pixel:jecxz    next_line     //scanline_rest_pixel不是0就是1movq   mm0, [esi]      //mm0 = 0xaarrggbb_AARRGGBBmovq   mm4, mm5        //mm4 = mm5 = 0x00ffffff_00ffffffmovq   mm1, mm0        //mm1 = mm0 = 0xaarrggbb_AARRGGBBmovq   mm2, mm0        //mm2 = mm0 = 0xaarrggbb_AARRGGBBpandn  mm4, mm0        //保存alpha, mm4 = 0xaa000000_AA000000movq   mm3, mm0        //mm3 = mm0 = 0xaarrggbb_AARRGGBBpunpckhbw  mm1, mm7    //扩展每个通道, mm1 = 0x00aa_00rr_00gg_00bbpunpcklbw  mm0, mm7    //mm0 = 0x00AA_00RR_00GG_00BBpunpcklbw  mm2, mm2    //构建2个象素的alpha, mm2 = 0xAAAA_RRRR_GGGG_BBBBpunpckhbw  mm3, mm3    //mm3 = 0xaaaa_rrrr_gggg_bbbbpunpckhwd  mm2, mm2    //mm2 = 0xAAAA_AAAA_RRRR_RRRRpunpckhwd  mm3, mm3    //mm3 = 0xaaaa_aaaa_rrrr_rrrrpunpckhdq  mm2, mm2    //mm2 = 0xAAAA_AAAA_AAAA_AAAApunpckhdq  mm3, mm3    //mm3 = 0xaaaa_aaaa_aaaa_aaaapand   mm2, mm6        //mm2 = 0x00AA_00AA_00AA_00AApand   mm3, mm6        //mm3 = 0x00aa_00aa_00aa_00aapmullw mm0, mm2        //自乘alpha,字组相乘,取低16位pmullw mm1, mm3psrlw   mm0, 8         //除以256psrlw   mm1, 8packuswb  mm0, mm0     //合并单个象素packuswb  mm1, mm1punpckldq mm0, mm1     //将2个象素合并pand   mm0, mm5        //恢复原始alphapor    mm0, mm4movd    [edi], mm0add esi, 4add edi, 4next_line:add    esi, src_pitch_sub_dst_pitch    //设置指针到下一个src行dec    edxjnz    loop_lineemms  //清除mmx指令状态end_pixel:}
}//< 32位光栅预乘Alpha,使用SSE指令处理
/************************************************************************/void    BltSurface32ToDIB32_SelfMulAlphaSSE( void *pDst, void *pSrc, unsigned int width, unsigned int height, unsigned int src_pitch )
{int    src_pitch_sub_dst_pitch;//src pointer对齐到下一行scanline,需要跳过多少字节__asm{//取参数,判断width和height是否有任一为0mov     eax, height    //eax = heighmov     ebx, widthmul     ebx            //width * heighttest    eax, eax       //影响ZFjz      end_pixel//常量赋值mov     esi, pSrcmov     edi, pDstpcmpeqd mm5, mm5       //mm5 = 0xffffffff_ffffffffpxor    mm7, mm7       //mm7 = 0x0psrld   mm5, 8         //mm5 = 0x00ffffff_00ffffff//判断pitchmov        edx, src_pitchshl        ebx, 2      //每个像素4个字节, dst_pitch = width * 4sub        edx, ebx    //src_pitch - dst_pitchjnz        diff_pitch//same_pitch:mov     ecx, eaxmov     edx, 1         //how many lines,eax和edx构成2层循环and     ecx, 1         //一行上剩下多少个不成对的象素,same_pitch时就是(width*height & 1),diff_pitch时就是(width & 1)shr     eax, 1         //一行上主循环多少次,same_pitch时就是(width*height >> 1)diff_pitchh时就是(width >> 1)jmp     test_pair_pixeldiff_pitch:mov     src_pitch_sub_dst_pitch, edx    //src_pitch - dst_pitchmov     eax, widthmov     edx, height    //how many lines,eax和edx构成2层循环mov     ecx, eaxand     ecx, 1         //一行上剩下多少个不成对的象素,same_pitch时就是(width*height & 1),diff_pitch时就是(width & 1)shr     eax, 1         //一行上主循环多少次,same_pitch时就是(width*height >> 1)diff_pitchh时就是(width >> 1)mov     ebx, eax       //main loop count on every scanlinejmp     test_pair_pixelloop_line:mov     eax, ebxloop_pair_pixel:movq   mm0, [esi]      //mm0 = 0xaarrggbb_AARRGGBBmovq   mm4, mm5        //mm4 = mm5 = 0x00ffffff_00ffffffmovq   mm1, mm0        //mm1 = mm0 = 0xaarrggbb_AARRGGBBpandn  mm4, mm0        //保存alpha, mm4 = 0xaa000000_AA000000punpcklbw mm0, mm7     //mm0 = 0x00AA_00RR_00GG_00BBpunpckhbw mm1, mm7     //mm1 = 0x00aa_00rr_00gg_00bbpshufw    mm2, mm0, 0xff    //mm2 = 0x00AA_00AA_00AA_00AApshufw    mm3, mm1, 0xff    //mm3 = 0x00aa_00aa_00aa_00aapmullw    mm0, mm2     //自乘alpha,字组相乘,取低16位pmullw    mm1, mm3psrlw   mm0, 8         //除以256psrlw   mm1, 8packuswb mm0, mm0      //合并单个象素packuswb mm1, mm1punpckldq mm0, mm1     //将2个象素合并pand   mm0, mm5        //恢复原始alphapor 
   mm0, mm4//put_pixel:MOVNTQ [edi], mm0add esi, 8add edi, 8dec eax
test_pair_pixel:jnz    loop_pair_pixel//rest_line_pixel:jecxz    next_line    //scanline_rest_pixel不是0就是1movq   mm0, [esi]     //mm0 = 0xaarrggbb_AARRGGBBmovq   mm4, mm5       //mm4 = mm5 = 0x00ffffff_00ffffffmovq   mm1, mm0       //mm1 = mm0 = 0xaarrggbb_AARRGGBBpandn  mm4, mm0       //保存alpha, mm4 = 0xaa000000_AA000000punpcklbw mm0, mm7    //mm0 = 0x00AA_00RR_00GG_00BBpunpckhbw mm1, mm7    //mm1 = 0x00aa_00rr_00gg_00bbpshufw  mm2, mm0, 0xff    //mm2 = 0x00AA_00AA_00AA_00AApshufw  mm3, mm1, 0xff    //mm3 = 0x00aa_00aa_00aa_00aapmullw  mm0, mm2      //自乘alpha,字组相乘,取低16位pmullw  mm1, mm3psrlw   mm0, 8        //除以256psrlw   mm1, 8packuswb mm0, mm0     //合并单个象素packuswb mm1, mm1punpckldq mm0, mm1    //将2个象素合并pand   mm0, mm5       //恢复原始alphapor    mm0, mm4movd   [edi], mm0add esi, 4add edi, 4next_line:add    esi, src_pitch_sub_dst_pitch    //设置指针到下一个src行dec    edxjnz    loop_lineemmsend_pixel:}
}//< 32位光栅预乘Alpha,使用SSE2指令处理
/************************************************************************/void    BltSurface32ToDIB32_SelfMulAlphaSSE2( void *pDst, void *pSrc, unsigned int width, unsigned int height, unsigned int src_pitch )
{int    src_pitch_sub_dst_pitch;//src pointer对齐到下一行scanline,需要跳过多少字节__asm{//取参数,判断width和height是否有任一为0mov     eax, height     //eax = heighmov     ebx, widthmul     ebx             //width * heighttest    eax, eax        //影响ZFjz      end_pixel//常量赋值mov     esi, pSrcmov     edi, pDstpcmpeqd xmm6, xmm6     //xmm6 = 0xffffffff_ffffffff_ffffffff_ffffffffxorps   xmm7, xmm7     //xmm7 = 0x0psrld   xmm6, 8        //xmm6 = 0x00ffffff_00ffffff_00ffffff_00ffffff//判断pitchmov     edx, src_pitchshl     ebx, 2         //每个像素4个字节, dst_pitch = width * 4cmp     edx, ebx       //src_pitch - dst_pitchjnz     diff_pitch//same_pitch:mov     ecx, eaxmov     edx, 1         //how many lines,eax和edx构成2层循环shr     eax, 2         //一行上主循环多少次,same_pitch时就是(width*height >> 2)diff_pitchh时就是(width >> 2)jmp     test_quat_pixeldiff_pitch:and    ebx, ~0x0fsub    edx, ebx        //src_pitch - dst_pitchmov    src_pitch_sub_dst_pitch, edx    //src_pitch - dst_pitchmov    eax, widthmov    edx, height     //how many lines,eax和edx构成2层循环test   eax, 3          //这里比较特殊,diff_pitch情况下,像素的个数必须是4的倍数,否则会导致Access Violation;因此如果遇到不是4的倍数,则退出jnz    end_pixelshr    eax, 2          //一行上主循环多少次,same_pitch时就是(width*height >> 2)diff_pitchh时就是(width >> 2)mov    ebx, eax        //main loop count on every scanlinejmp    test_quat_pixelloop_line:mov    eax, ebxloop_quat_pixel:MOVUPS  xmm0, [esi]      //xmm0 = 0xaarrggbb_AARRGGBB_wwxxyyzz_WWXXYYZZMOVAPS  xmm5, xmm6       //xmm5 = xmm6 = 0x00ffffff_00ffffff_00ffffff_00ffffffMOVAPS  xmm1, xmm0       //xmm1 = xmm0 = 0xaarrggbb_AARRGGBB_wwxxyyzz_WWXXYYZZandnps  xmm5, xmm0       //xmm5 = 0xaa000000_AA000000_ww000000_WW000000, 保存alphapunpcklbw  xmm0, xmm7    //xmm0 = 0x00ww_00xx_00yy_00zz_00WW_00XX_00YY_00ZZpunpckhbw  xmm1, xmm7    //xmm1 = 0x00aa_00rr_00gg_00bb_00AA_00RR_00GG_00BBpshuflw    xmm2, xmm0, 0xff    //xmm2 = 0xww_00xx_00yy_00zz_00WW_00WW_00WW_00WWpshuflw    xmm3, xmm1, 0xff    //xmm3 = 0xaa_00rr_00gg_00bb_00AA_00AA_00AA_00AApshufhw    xmm2, xmm2, 0xff    //xmm2 = 
0x00ww_00ww_00ww_00ww_00WW_00WW_00WW_00WWpshufhw    xmm3, xmm3, 0xff    //xmm3 = 0x00aa_00aa_00aa_00aa_00AA_00AA_00AA_00AApmullw  xmm0, xmm2       //自乘alpha,字组相乘,取低16位pmullw  xmm1, xmm3psrlw   xmm0, 8          //除以256,取16位中的高位psrlw   xmm1, 8packuswb xmm0, xmm0      //合并2个象素packuswb xmm1, xmm1PUNPCKLQDQ xmm0, xmm1    //将4个象素合并andps xmm0, xmm6         //恢复原始alphaorps  xmm0, xmm5//put_pixel:MOVNTDQ [edi], xmm0add esi, 16add edi, 16dec eax
test_quat_pixel:jnz    loop_quat_pixel//sse2模式下不处理剩余像素,非4的倍数;//next_line:add    esi, src_pitch_sub_dst_pitch    //设置指针到下一个src行dec    edxjnz    loop_lineend_pixel:}
}



更多推荐

游戏开发中常用光栅处理函数(二)

本文发布于:2024-02-14 07:39:14,感谢您对本站的认可!
本文链接:https://www.elefans.com/category/jswz/34/1762294.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
本文标签:光栅   游戏开发   函数   常用

发布评论

评论列表 (有 0 条评论)
草根站长

>www.elefans.com

编程频道|电子爱好者 - 技术资讯及电子产品介绍!