; Forum post (translated): "Old procedure, needs to be optimized for speed."
; (See the P.P.S. at the end: the author later says this is in fact the
; already optimized variant.)
;
; Syntax: flat assembler (FASM) with the proc/endp macros, 32-bit x86 + MMX.

; ---------------------------------------------------------------------------
; Example call sequence
; ---------------------------------------------------------------------------
        call    InitMMX4_7              ; load the bit masks into mm4..mm7
        mov     edi, data64             ; data64 is defined outside this snippet
        call    MMX_Calc1               ; eax / edx receive the two bit counts

; ---------------------------------------------------------------------------
; InitMMX4_7 - preload the MMX mask registers used by MMX_Calc1.
;
;   Out: mm7 = 0x5555555555555555
;        mm6 = 0x3333333333333333
;        mm5 = 0x0F0F0F0F0F0F0F0F
;        mm4 = 0x00FF00FF00FF00FF
;   All general-purpose registers are preserved.
;
; Each mask is addressed by its own label, so no scratch register is needed
; and the routine does not depend on the four tables being adjacent in memory.
; ---------------------------------------------------------------------------
proc InitMMX4_7
        movq    mm7, qword [t0x5555]    ; 0x5555
        movq    mm6, qword [t0x3333]    ; 0x3333
        movq    mm5, qword [t0x0F0F]    ; 0x0F0F
        movq    mm4, qword [t0x00FF]    ; 0x00FF
        ret
endp

; ---------------------------------------------------------------------------
; MMX_Calc1 - count the set bits in two 64-bit values at the same time.
;
;   In:   edi       -> data; the first qword is read from [edi], the second
;                      from [edi+64]
;         mm4..mm7  =  masks loaded by InitMMX4_7
;   Out:  eax       =  number of set bits in qword [edi]     (0..64)
;         edx       =  number of set bits in qword [edi+64]  (0..64)
;   Uses: eax, edx, edi, mm0 - mm3 (mm4 - mm7 are only read)
;
; The two values are processed as two interleaved instruction streams
; (mm0/mm1 for the first value, mm2/mm3 for the second). The instruction
; order is kept exactly as in the original, since it was hand-scheduled.
;
; NOTE(review): the original header says "pointer to Data (2 qword's)", yet
; the second operand is taken from [edi+64], i.e. 64 bytes after the first,
; not 8. Left unchanged - confirm against the layout of data64.
; ---------------------------------------------------------------------------
proc MMX_Calc1
        movq    mm0, qword [edi]        ; load data
        movq    mm2, qword [edi+64]
        ; NOTE(review): the next instruction appears to be disabled in the
        ; original (ebp is not listed among the registers used); kept disabled.
;       pxor    mm0, qword [ebp]

        ; Step 1: add neighbouring bits -> a 2-bit count in every bit pair.
        movq    mm1, mm0
        movq    mm3, mm2
        psrld   mm1, 1
        psrld   mm3, 1
        pand    mm0, mm7
        pand    mm2, mm7
        pand    mm1, mm7
        pand    mm3, mm7
        paddd   mm0, mm1
        paddd   mm2, mm3

        ; Step 2: add neighbouring pairs -> a count in every nibble.
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm0, mm6
        pand    mm2, mm6
        psrld   mm1, 2
        psrld   mm3, 2
        pand    mm1, mm6
        pand    mm3, mm6
        paddd   mm0, mm1
        paddd   mm2, mm3

        ; Step 3: add neighbouring nibbles.
        movq    mm1, mm0
        movq    mm3, mm2
        psrld   mm1, 4
        psrld   mm3, 4
        pand    mm0, mm5
        pand    mm2, mm5
        pand    mm1, mm5
        pand    mm3, mm5
        paddd   mm0, mm1                ; result in every byte (8)
        paddd   mm2, mm3

        ; Step 4: add neighbouring bytes.
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm0, mm4
        pand    mm2, mm4
        psrld   mm1, 8
        psrld   mm3, 8
        pand    mm1, mm4
        pand    mm3, mm4
        paddd   mm0, mm1                ; result in every word (4)
        paddd   mm2, mm3

        ; Step 5: add the two words of each dword; the sum lands in the low
        ; word of the dword, the high word keeps its old partial count.
        movq    mm1, mm0
        movq    mm3, mm2
        psrld   mm0, 16
        psrld   mm2, 16
        paddd   mm1, mm0
        paddd   mm3, mm2

        ; Step 6: add the upper dword onto the lower one, word by word.
        movq    mm0, mm1
        movq    mm2, mm3
        psrlq   mm0, 32
        psrlq   mm2, 32
        paddw   mm1, mm0
        paddw   mm3, mm2

        movd    eax, mm1                ; low word = total, high word = leftover partial sum
        movd    edx, mm3

        ; Drop the leftover partial sum in bits 16..31. The total of a 64-bit
        ; value can be as large as 64, so 7 bits must be kept; the former
        ; 0x1F mask cut off every count of 32 or more (32 and 64 became 0).
        and     eax, 0x0000007F
        and     edx, 0x0000007F
        ret
endp

; ---------------------------------------------------------------------------
; Bit masks loaded by InitMMX4_7.
; ---------------------------------------------------------------------------
t0x5555 dq 0x5555555555555555
t0x3333 dq 0x3333333333333333
t0x0F0F dq 0x0F0F0F0F0F0F0F0F
t0x00FF dq 0x00FF00FF00FF00FF

; P.S.   Insert emms where necessary (before any x87 FPU code that follows).
; P.P.S. Sorry, this is the optimized variant.