/*
 * fmemcpy - copy n bytes from src to dest.
 *
 *   dest  destination buffer, at least n bytes
 *   src   source buffer, at least n bytes
 *   n     number of bytes to copy; values <= 0 copy nothing
 *
 * Bytes are copied front to back. Overlapping regions are not handled
 * specially (this is a memcpy, not a memmove).
 *
 * On 32-bit x86 with MSVC-style inline assembly the copy uses MMX
 * registers; every assembly path finishes with EMMS so the x87 FPU is
 * usable again on return. On any other target a plain C byte loop is
 * used instead.
 *
 * The inline assembly uses REP MOVSB and therefore relies on the
 * direction flag being clear on entry (NOTE(review): this is what the
 * MSVC calling convention requires - confirm for other callers).
 */
static void fmemcpy(void *dest, void *src, int n)
{
#if defined(_MSC_VER) && defined(_M_IX86)
#if 1
    /* Simple variant: byte-copy the n % 64 leading bytes, then move the
       rest in 64-byte blocks through mm0-mm3. */
    __asm
    {
        mov     edi, dest
        mov     esi, src
        mov     ecx, n
        test    ecx, ecx
        jle     _mmx_copy_end       // n <= 0: nothing to copy (a negative n
                                    // would otherwise become a huge count)
        mov     edx, ecx            // edx = n, kept across the byte copy

        // Copy the n % 64 remainder first so that what is left is a whole
        // number of 64-byte blocks. This does not align either pointer.
        and     ecx, 63
        rep     movsb

        mov     ecx, edx
        shr     ecx, 6              // ecx = number of 64-byte blocks
        jz      _mmx_copy_end       // SHR already set ZF; no CMP needed

    _mmx_copy:
        movq    mm0, [esi+0]
        movq    mm1, [esi+8]
        movq    [edi+0], mm0
        movq    [edi+8], mm1
        movq    mm2, [esi+16]
        movq    mm3, [esi+24]
        movq    [edi+16], mm2
        movq    [edi+24], mm3
        movq    mm0, [esi+32]
        movq    mm1, [esi+40]
        movq    [edi+32], mm0
        movq    [edi+40], mm1
        movq    mm2, [esi+48]
        movq    mm3, [esi+56]
        movq    [edi+48], mm2
        movq    [edi+56], mm3
        add     esi, 64
        add     edi, 64
        dec     ecx
        jnz     _mmx_copy

    _mmx_copy_end:
        emms                        // leave MMX state so x87 code works again
    }
#else
    /* Block-prefetch variant: each 2048-byte block is first read into the
       temporary buffer tbuf and then written to dest with non-temporal
       stores. tbuf is declared outside this function.
       NOTE(review): "mov reg, tbuf" loads the value stored in tbuf, so this
       presumably expects tbuf to be a pointer variable addressing at least
       2048 bytes - confirm against its declaration. */
    __asm
    {
        mov     esi, src
        mov     edi, dest
        mov     ebx, n
        test    ebx, ebx
        jle     _nt_copy_end        // n <= 0: nothing to copy
        shr     ebx, 11             // ebx = number of whole 2048-byte blocks
        jz      _nt_copy_tail       // fewer than 2048 bytes: skip block loop
                                    // (otherwise DEC would wrap the counter)

    loop2k:                         // Copy 2k into temporary buffer
        push    edi                 // save dest cursor while edi walks tbuf
        mov     edi, tbuf
        mov     ecx, 32             // 2048 / 64 iterations
    loopMemToL1:
        prefetchnta 64[ESI]         // Prefetch next loop, non-temporal
        prefetchnta 96[ESI]
        movq    mm1, 0[ESI]         // Read in source data
        movq    mm2, 8[ESI]
        movq    mm3, 16[ESI]
        movq    mm4, 24[ESI]
        movq    mm5, 32[ESI]
        movq    mm6, 40[ESI]
        movq    mm7, 48[ESI]
        movq    mm0, 56[ESI]
        movq    0[EDI], mm1         // Store into the temporary buffer
        movq    8[EDI], mm2
        movq    16[EDI], mm3
        movq    24[EDI], mm4
        movq    32[EDI], mm5
        movq    40[EDI], mm6
        movq    48[EDI], mm7
        movq    56[EDI], mm0
        add     esi, 64
        add     edi, 64
        dec     ecx
        jnz     loopMemToL1

        pop     edi                 // Now copy from tbuf to destination
        push    esi                 // save src cursor while esi walks tbuf
        mov     esi, tbuf
        mov     ecx, 32             // 2048 / 64 iterations
    loopL1ToMem:
        movq    mm1, 0[ESI]         // Read the block back from tbuf
        movq    mm2, 8[ESI]
        movq    mm3, 16[ESI]
        movq    mm4, 24[ESI]
        movq    mm5, 32[ESI]
        movq    mm6, 40[ESI]
        movq    mm7, 48[ESI]
        movq    mm0, 56[ESI]
        movntq  0[EDI], mm1         // Non-temporal stores
        movntq  8[EDI], mm2
        movntq  16[EDI], mm3
        movntq  24[EDI], mm4
        movntq  32[EDI], mm5
        movntq  40[EDI], mm6
        movntq  48[EDI], mm7
        movntq  56[EDI], mm0
        add     esi, 64
        add     edi, 64
        dec     ecx
        jnz     loopL1ToMem

        pop     esi                 // Do next 2k block
        dec     ebx
        jnz     loop2k

        sfence                      // order the non-temporal stores before
                                    // any later stores

    _nt_copy_tail:
        mov     ecx, n
        and     ecx, 2047           // bytes left after the whole blocks
        rep     movsb

    _nt_copy_end:
        emms                        // leave MMX state so x87 code works again
    }
#endif
#else
    /* Portable fallback for targets without MSVC x86 inline assembly. */
    {
        unsigned char *d = (unsigned char *)dest;
        const unsigned char *s = (const unsigned char *)src;
        int i;

        for (i = 0; i < n; ++i)     /* runs zero times when n <= 0 */
            d[i] = s[i];
    }
#endif
}