最快的算法alphablend汇编源代码，Intel官方提供

Intel官方网站有一个ablend_565的快速汇编算法，理论上是把一块32bit RGBA渲染到16bit的buffer上。我的机器是PIII 800，函数在system memory中进行，640*480的256级alpha blending，达到100fps，我想可以满足绝大部分的要求了。在这里，我提供了这个算法的应用，希望可以对大家有所帮助。

ablend_565函数，源代码可以直接编译使用，无需其他库函数，感谢intel提供这么好的东西。

首先，我提供一些本人编写的把32bit tga文件读入pRGBABuffer的函数

文件尺寸保存在 width,height

//-----------------------------------------------------------------------

// Name: LoadTgaFile( TCHAR* strPathname, DWORD** pRGBABuffer, long* width, long* height )

// Desc: 读取32bit tga文件到DWORD缓冲里，返回其尺寸

// Time: 2002.06.22 00:36

// Author: RealRender

// Para:

// Return:

// Note: 这段代码来自directx 7.0 sample中的d3dtextr.cpp，我把他提取了出来

// 方便使用

//-----------------------------------------------------------------------

BOOL LoadTgaFile( TCHAR* strPathname, DWORD** pRGBABuffer, long* width, long* height )

{

FILE* file = fopen( strPathname, "rb" );

if( NULL == file )

return false;

struct TargaHeader

{

BYTE IDLength;

BYTE ColormapType;

BYTE ImageType;

BYTE ColormapSpecification[5];

WORD XOrigin;

WORD YOrigin;

WORD ImageWidth;

WORD ImageHeight;

BYTE PixelDepth;

BYTE ImageDescriptor;

} tga;

fread( &tga, sizeof(TargaHeader), 1, file );

// Only true color, non-mapped images are supported

if( ( 0 != tga.ColormapType ) ||

( tga.ImageType != 10 && tga.ImageType != 2 ) )

{

fclose( file );

return false;

}

// Skip the ID field. The first byte of the header is the length of this field

if( tga.IDLength )

fseek( file, tga.IDLength, SEEK_CUR );

DWORD m_dwWidth = tga.ImageWidth;

DWORD m_dwHeight = tga.ImageHeight;

DWORD m_dwBPP = tga.PixelDepth;

DWORD *m_pRGBAData = new DWORD[m_dwWidth*m_dwHeight];

if( m_pRGBAData == NULL )

{

fclose(file);

return false;

}

for( DWORD y=0; y

{

DWORD dwOffset = y*m_dwWidth;

if( 0 == ( tga.ImageDescriptor & 0x0010 ) )

dwOffset = (m_dwHeight-y-1)*m_dwWidth;

for( DWORD x=0; x

{

if( tga.ImageType == 10 )

{

BYTE PacketInfo = getc( file );

WORD PacketType = 0x80 & PacketInfo;

WORD PixelCount = ( 0x007f & PacketInfo ) + 1;

if( PacketType )

{

DWORD b = getc( file );

DWORD g = getc( file );

DWORD r = getc( file );

DWORD a = 0xff;

if( m_dwBPP == 32 )

a = getc( file );

while( PixelCount-- )

{

m_pRGBAData[dwOffset+x] = (r

x++;

}

else

{

while( PixelCount-- )

{

BYTE b = getc( file );

BYTE g = getc( file );

BYTE r = getc( file );

BYTE a = 0xff;

if( m_dwBPP == 32 )

a = getc( file );

m_pRGBAData[dwOffset+x] = (r

x++;

}

else

{

BYTE b = getc( file );

BYTE g = getc( file );

BYTE r = getc( file );

BYTE a = 0xff;

if( m_dwBPP == 32 )

a = getc( file );

m_pRGBAData[dwOffset+x] = (r

x++;

}

fclose( file );

// Check for alpha content

for( DWORD i=0; i

{

if( m_pRGBAData[i] & 0x000000ff != 0xff )

{

//m_bHasAlpha = TRUE;

break;

}

*pRGBABuffer = m_pRGBAData;

*width = m_dwWidth;

*height = m_dwHeight;

return true;

}

把32bit buffer分割为rgb和alpha的代码。

注意，分割后的pBitmap一定要是8字节对齐，这是优化的一个重要条件，所以，我的算法中：

BYTE* p = new BYTE[lSize*2+8];

BYTE* pOrig = p;

p += (DWORD)p%8;

WORD* color = (WORD*)p;

这是不规范的写法，把指针强行改变为8字节对齐。实际使用的时候，要记住释放的原始指针不是p，而是pOrig。在这里，我没有释放分配的内存，请谅解。

//-----------------------------------------------------------------------
// Name: SplitRGBA( DWORD* pRGBABuffer, LPBYTE* pAlpha, LPWORD* pBitmap, long lWidth, long lHeight )
// Desc: Builds a 16-bit colour buffer and a separate 8-bit alpha buffer
//       from a 32-bit buffer whose pixels are packed as
//       (r<<24)|(g<<16)|(b<<8)|a.
// Time: 2002.06.22 00:36
// Author: RealRender
// Para: pRGBABuffer - source pixels, lWidth*lHeight DWORDs
//       pAlpha      - receives a new[] array of lWidth*lHeight alpha bytes
//       pBitmap     - receives an 8-byte aligned array of lWidth*lHeight
//                     16-bit pixels produced by RGBTo16()
//       lWidth, lHeight - image size in pixels
// Return: none. *pAlpha and *pBitmap are set to NULL when pRGBABuffer is
//         NULL or a dimension is not positive.
// Note: RGBTo16() is defined elsewhere; presumably it packs 8-bit r/g/b
//       into 565 - confirm against its definition.
//       *pBitmap points into a new[] block after an alignment adjustment
//       of 0..7 bytes, and the unadjusted pointer is not returned. The
//       caller can therefore only delete[] it safely when the adjustment
//       was 0; freeing it properly needs an interface change.
//-----------------------------------------------------------------------
void SplitRGBA( DWORD* pRGBABuffer, LPBYTE* pAlpha, LPWORD* pBitmap, long lWidth, long lHeight )
{
    if( NULL == pAlpha || NULL == pBitmap )
        return;

    *pAlpha  = NULL;
    *pBitmap = NULL;

    if( NULL == pRGBABuffer || lWidth <= 0 || lHeight <= 0 )
        return;

    const long lSize = lWidth*lHeight;

    BYTE* alpha = new BYTE[lSize];

    // Over-allocate by 8 bytes so the start can be moved up to the next
    // 8-byte boundary.
    BYTE* raw = new BYTE[lSize*2+8];

    // Round the address UP to a multiple of 8. The original `p += p % 8`
    // does not do that (an address with remainder 2 ends up at remainder 4).
    raw += ( 8 - (size_t)raw % 8 ) % 8;

    WORD* color = (WORD*)raw;

    for( long i = 0; i < lSize; i++ )
    {
        const DWORD dwPixel = pRGBABuffer[i];

        const DWORD r = ( dwPixel >> 24 ) & 0x000000ff;
        const DWORD g = ( dwPixel >> 16 ) & 0x000000ff;
        const DWORD b = ( dwPixel >>  8 ) & 0x000000ff;
        const DWORD a = ( dwPixel >>  0 ) & 0x000000ff;

        alpha[i] = (BYTE)a;

        // Convert the 8-bit channels to a 16-bit pixel.
        color[i] = RGBTo16( r, g, b );
    }

    *pAlpha  = alpha;
    *pBitmap = color;
}

这个是Intel官方提供的函数。函数的描述，用我的话来说，就是把一个带有256级alpha通道的565颜色数据绘制到16位目标页面。

函数说明：

unsigned char *lpAlpha, // 256 级alpha通道

unsigned int iAlpPitch, // alpha通道的pitch

unsigned char *lpSrc, // 原色彩缓冲

unsigned int iSrcX, //

unsigned int iSrcY, // 原色彩位置

unsigned int iSrcPitch, // 原色彩pitch

unsigned char *lpDst, // 目标缓冲

unsigned int iDstX,

unsigned int iDstY, // 目标位置

unsigned int iDstW,

unsigned int iDstH, // 目标缓冲的尺寸

unsigned int iDstPitch // 目标缓冲的pitch

void ablend_565(unsigned char *lpAlpha,unsigned int iAlpPitch,

unsigned char *lpSrc,unsigned int iSrcX, unsigned int iSrcY,

unsigned int iSrcPitch, unsigned char *lpDst,

unsigned int iDstX, unsigned int iDstY,

unsigned int iDstW, unsigned int iDstH,

unsigned int iDstPitch)

{

//Mask for isolating the red,green, and blue components

static __int64 MASKB=0x001F001F001F001F;

static __int64 MASKG=0x07E007E007E007E0;

static __int64 MASKSHIFTG=0x03F003F003F003F0;

static __int64 MASKR=0xF800F800F800F800;

//constants used by the integer alpha blending equation

static __int64 SIXTEEN=0x0010001000100010;

static __int64 FIVETWELVE=0x0200020002000200;

static __int64 SIXONES=0x003F003F003F003F;

unsigned char *lpLinearDstBp=(iDstX

unsigned char *lpLinearSrcBp=(iSrcX

unsigned char *lpLinearAlpBp=iSrcX+(iSrcY*iAlpPitch)+lpAlpha; //base pointer for linear alpha

_asm{

mov esi,lpLinearSrcBp; //src

mov edi,lpLinearDstBp; //dst

mov eax,lpLinearAlpBp; //alpha

mov ecx,iDstH; //ecx=number of lines to copy

mov ebx,iDstW; //ebx=span width to copy

test esi,6; //check if source address is qword aligned

//since addr coming in is always word aligned(16bit)

jnz done; //if not qword a