// Eric Goldstein's reverse-and-add implementation // // "The attached file contains the source // for my reverse-add-and-check algorithm." // // "It uses C and specific Intel P4 instructions, // and compiles under MSVC 6.0 with the latest // processor pack." // // (03/22/2003) char ReverseAddandCheck() { DWORD *j = &Length; char lc_Rem[128], *lcRem = lc_Rem; INT128 c14141414141414141414141414141414 = {0x14141414,0x14141414,0x14141414,0x14141414}; INT128 c76767676767676767676767676767676 = {0x76767676,0x76767676,0x76767676,0x76767676}; INT128 c7F7F7F7F7F7F7F7F7F7F7F7F7F7F7F7F = {0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F}; ULARGE_INTEGER cFFFFFFFFFFFFFFFF, cF6F6F6F6F6F6F6F6; cF6F6F6F6F6F6F6F6.LowPart = 0xF6F6F6F6; cF6F6F6F6F6F6F6F6.HighPart = 0xF6F6F6F6; cFFFFFFFFFFFFFFFF.LowPart = 0xFFFFFFFF; cFFFFFFFFFFFFFFFF.HighPart = 0xFFFFFFFF; __asm { xor eax, eax //default carry mov esi, Number1 mov ebx, j push ebx mov ebx, [ebx] mov ecx, ebx mov edx, ebx and edx, 127 push edx shr ecx, 7 jz add32_2_2 movdqu xmm4, c14141414141414141414141414141414 movdqu xmm5, c76767676767676767676767676767676 movdqu xmm6, c7F7F7F7F7F7F7F7F7F7F7F7F7F7F7F7F pxor mm7, mm7 movq mm3, cF6F6F6F6F6F6F6F6 movq mm4, cFFFFFFFFFFFFFFFF } add32_2_1: __asm { mov eax, [esi + ebx - 4] mov edx, [esi + ebx - 8] bswap eax bswap edx add eax, [esi] add edx, [esi + 4] movd mm0, eax movd mm6, edx punpckldq mm0, mm6 bswap eax bswap edx mov [esi + ebx - 4], eax mov [esi + ebx - 8], edx mov eax, [esi + ebx - 12] mov edx, [esi + ebx - 16] bswap eax bswap edx add eax, [esi + 8] add edx, [esi + 12] movd mm1, eax movd mm6, edx punpckldq mm1, mm6 bswap eax bswap edx mov [esi + ebx - 12], eax mov [esi + ebx - 16], edx psubb mm0, mm7 paddq mm0, mm3 movq mm7, mm0 pcmpgtb mm7, mm4 psrlq mm7, 56 psubb mm1, mm7 paddq mm1, mm3 movq2dq xmm0, mm0 movq2dq xmm7, mm1 punpcklqdq xmm0, xmm7 movq mm7, mm1 pcmpgtb mm7, mm4 psrlq mm7, 56 mov eax, [esi + ebx - 20] mov edx, [esi + ebx - 24] bswap eax bswap edx add eax, [esi + 16] add edx, [esi + 20] movd mm0, eax movd mm6, edx punpckldq mm0, mm6 bswap eax bswap edx mov [esi + ebx - 20], eax mov [esi + ebx - 24], edx mov eax, [esi + ebx - 28] mov edx, [esi + ebx - 32] bswap eax bswap edx add eax, [esi + 24] add edx, [esi + 28] movd mm1, eax movd mm6, edx punpckldq mm1, mm6 bswap eax bswap edx mov [esi + ebx - 28], eax mov [esi + ebx - 32], edx psubb mm0, mm7 paddq mm0, mm3 movq mm7, mm0 pcmpgtb mm7, mm4 psrlq mm7, 56 psubb mm1, mm7 paddq mm1, mm3 movq2dq xmm1, mm0 movq2dq xmm7, mm1 punpcklqdq xmm1, xmm7 movq mm7, mm1 pcmpgtb mm7, mm4 psrlq mm7, 56 mov eax, [esi + ebx - 36] mov edx, [esi + ebx - 40] bswap eax bswap edx add eax, [esi + 32] add edx, [esi + 36] movd mm0, eax movd mm6, edx punpckldq mm0, mm6 bswap eax bswap edx mov [esi + ebx - 36], eax mov [esi + ebx - 40], edx mov eax, [esi + ebx - 44] mov edx, [esi + ebx - 48] bswap eax bswap edx add eax, [esi + 40] add edx, [esi + 44] movd mm1, eax movd mm6, edx punpckldq mm1, mm6 bswap eax bswap edx mov [esi + ebx - 44], eax mov [esi + ebx - 48], edx psubb mm0, mm7 paddq mm0, mm3 movq mm7, mm0 pcmpgtb mm7, mm4 psrlq mm7, 56 psubb mm1, mm7 paddq mm1, mm3 movq2dq xmm2, mm0 movq2dq xmm7, mm1 punpcklqdq xmm2, xmm7 movq mm7, mm1 pcmpgtb mm7, mm4 psrlq mm7, 56 mov eax, [esi + ebx - 52] mov edx, [esi + ebx - 56] bswap eax bswap edx add eax, [esi + 48] add edx, [esi + 52] movd mm0, eax movd mm6, edx punpckldq mm0, mm6 bswap edx bswap eax movd mm2, edx //save the last two qwords in mm2 and mm5 instead of memory movd mm6, eax punpckldq mm2, mm6 mov eax, [esi + ebx - 60] mov edx, [esi + ebx - 64] bswap eax bswap edx add eax, [esi + 56] add edx, [esi + 60] movd mm1, eax movd mm6, edx punpckldq mm1, mm6 bswap edx bswap eax movd mm5, edx movd mm6, eax punpckldq mm5, mm6 psubb mm0, mm7 paddq mm0, mm3 movq mm7, mm0 pcmpgtb mm7, mm4 psrlq mm7, 56 psubb mm1, mm7 paddq mm1, mm3 movq2dq xmm3, mm0 movq2dq xmm7, mm1 punpcklqdq xmm3, xmm7 movq mm7, mm1 pcmpgtb mm7, mm4 psrlq mm7, 56 ///////////////////////////////////////////// pand xmm0, xmm6 pand xmm1, xmm6 pand xmm2, xmm6 pand xmm3, xmm6 movdqa xmm7, xmm0 pcmpgtb xmm7, xmm4 pand xmm7, xmm5 psubb xmm0, xmm7 movdqa xmm7, xmm1 pcmpgtb xmm7, xmm4 pand xmm7, xmm5 psubb xmm1, xmm7 movdqa xmm7, xmm2 pcmpgtb xmm7, xmm4 pand xmm7, xmm5 psubb xmm2, xmm7 movdqa xmm7, xmm3 pcmpgtb xmm7, xmm4 pand xmm7, xmm5 psubb xmm3, xmm7 movdqa [esi ], xmm0 movdqa [esi + 16], xmm1 movdqa [esi + 32], xmm2 movdqa [esi + 48], xmm3 //////////////////////////////////////////////////// paddq mm5, mm3 movq mm6, mm5 pcmpgtb mm6, mm4 psrlq mm6, 56 psubb mm2, mm6 paddq mm2, mm3 movq2dq xmm0, mm5 movq2dq xmm7, mm2 punpcklqdq xmm0, xmm7 movq mm6, mm2 pcmpgtb mm6, mm4 psrlq mm6, 56 movq mm0, [esi + ebx - 48] movq mm1, [esi + ebx - 40] psubb mm0, mm6 paddq mm0, mm3 movq mm6, mm0 pcmpgtb mm6, mm4 psrlq mm6, 56 psubb mm1, mm6 paddq mm1, mm3 movq2dq xmm1, mm0 movq2dq xmm7, mm1 punpcklqdq xmm1, xmm7 movq mm6, mm1 pcmpgtb mm6, mm4 psrlq mm6, 56 movq mm0, [esi + ebx - 32] movq mm1, [esi + ebx - 24] psubb mm0, mm6 paddq mm0, mm3 movq mm6, mm0 pcmpgtb mm6, mm4 psrlq mm6, 56 psubb mm1, mm6 paddq mm1, mm3 movq2dq xmm2, mm0 movq2dq xmm7, mm1 punpcklqdq xmm2, xmm7 movq mm6, mm1 pcmpgtb mm6, mm4 psrlq mm6, 56 movq mm0, [esi + ebx - 16] movq mm1, [esi + ebx - 8] psubb mm0, mm6 paddq mm0, mm3 movq mm6, mm0 pcmpgtb mm6, mm4 psrlq mm6, 56 psubb mm1, mm6 paddq mm1, mm3 movq2dq xmm3, mm0 movq2dq xmm7, mm1 punpcklqdq xmm3, xmm7 movq mm6, mm1 pcmpgtb mm6, mm4 psrlq mm6, 56 pand xmm0, xmm6 pand xmm1, xmm6 pand xmm2, xmm6 pand xmm3, xmm6 movdqa xmm7, xmm0 pcmpgtb xmm7, xmm4 pand xmm7, xmm5 psubb xmm0, xmm7 movdqa xmm7, xmm1 pcmpgtb xmm7, xmm4 pand xmm7, xmm5 psubb xmm1, xmm7 movdqa xmm7, xmm2 pcmpgtb xmm7, xmm4 pand xmm7, xmm5 psubb xmm2, xmm7 movdqa xmm7, xmm3 pcmpgtb xmm7, xmm4 pand xmm7, xmm5 psubb xmm3, xmm7 movdqu [esi + ebx - 64], xmm0 movdqu [esi + ebx - 48], xmm1 movdqu [esi + ebx - 32], xmm2 movdqu [esi + ebx - 16], xmm3 movd eax, mm6 and eax, 1 jz l3 //nocarry push ebx } l1: __asm { inc byte ptr [esi + ebx] cmp byte ptr [esi + ebx], 10 jl l2 sub byte ptr [esi + ebx], 10 inc ebx jnz l1 } l2: __asm { pop ebx } l3: __asm { //////////////////// sub ebx, 128 add esi, 64 dec ecx jnz add32_2_1 movd eax, mm7 //save carry and eax, 1 emms } add32_2_2: __asm { pop edx or edx, edx jz l100 mov ecx, edx mov ebx, edx mov edi, lcRem pushf push esi cld shr ecx, 2 rep movsd mov ecx, edx and ecx, 3 rep movsb pop esi popf bt eax, 0 mov edi, lcRem } l1000: __asm { dec edx mov al, [esi + ecx] adc al, [edi + edx] add al, 0xF6 jc l1001 sub al, 0xF6 } l1001: __asm { mov [esi + ecx], al inc ecx dec ebx jnz l1000 mov eax, 0 adc eax, 0 } l100: __asm { bt eax, 0 jnc l6 } l5: __asm { inc byte ptr [esi + ecx] cmp byte ptr [esi + ecx], 10 jl l6 sub byte ptr [esi + ecx], 10 inc ecx jnz l5 } l6: __asm { mov esi, Number1 pop ebx mov edx, [ebx] mov eax, [esi + edx] bt eax, 0 adc dword ptr [ebx], 0 //adjust length } __asm { mov ebx, [ebx] xor edx, edx mov ecx, ebx shr ecx, 1 mov ah, 0 } palinchk_1: __asm { dec ebx mov al, [esi + edx] cmp al, [esi + ebx] jnz palinchk_2 inc edx dec ecx jnz palinchk_1 mov ah, 1 } palinchk_2: __asm { mov al, ah //return value, 1=palindrome } }