// The code in this file is only used for x86 compilation. // For x64 mode the file Int128x64.asm should be used instead. // They execute much faster, because they use 64-bit registers, // and division functions are coded entirely in assembler. #ifndef _M_X64 #include "Int128.h" typedef unsigned int UINT; void int128add( void *dst, const void *x) { __asm { mov esi, x mov edi, dst mov eax, dword ptr[esi] add dword ptr[edi], eax mov eax, dword ptr[esi+4] adc dword ptr[edi+4], eax mov eax, dword ptr[esi+8] adc dword ptr[edi+8], eax mov eax, dword ptr[esi+12] adc dword ptr[edi+12], eax } } void int128sub(void *dst, const void *x) { __asm { mov esi, x mov edi, dst mov eax, dword ptr[esi] sub dword ptr[edi], eax mov eax, dword ptr[esi+4] sbb dword ptr[edi+4], eax mov eax, dword ptr[esi+8] sbb dword ptr[edi+8], eax mov eax, dword ptr[esi+12] sbb dword ptr[edi+12], eax } } void int128inc(void *x) { __asm { mov edi, x add dword ptr[edi], 1 jnc Done adc dword ptr[edi+4], 0 adc dword ptr[edi+8], 0 adc dword ptr[edi+12], 0 Done: } } void int128dec(void *x) { __asm { mov edi, x sub dword ptr[edi], 1 jnc Done sbb dword ptr[edi+4], 0 sbb dword ptr[edi+8], 0 sbb dword ptr[edi+12], 0 Done: } } #pragma warning(disable:4731) void int128mul(void *dst, const void *x) { _int128 *dp = (_int128*)dst; const _int128 *b = (const _int128*)x; if (!(dp->s4.i[1] || b->s4.i[1] || dp->s4.i[2] || b->s4.i[2] || dp->s4.i[3] || b->s4.i[3])) { HI64(*dp) = 0; LO64(*dp) = __int64(dp->s4.i[0]) * b->s4.i[0]; // simple _int64 multiplication. int32 * int32 } else { const _int128 a(*dp); __asm { push ebp lea ebx, a // ebx = &a mov ecx, b // ecx = b mov ebp, dst // ebp = &dp (dst) xor eax, eax mov dword ptr[ebp+8 ], eax // dst[2..3] = 0 mov dword ptr[ebp+12], eax mov eax, dword ptr[ebx] // 1. round mul dword ptr[ecx] // [edx:eax] = x[0]*y[0] mov dword ptr[ebp] , eax mov dword ptr[ebp+4], edx // dst[0:1] = [edx:eax] mov eax, dword ptr[ebx] // 2. round mul dword ptr[ecx+4] // [edx:eax] = x[0]*y[1] mov esi, eax // mov edi, edx // [edi:esi] = [edx:eax] mov eax, dword ptr[ebx+4] mul dword ptr[ecx] // [edx:eax] = x[1]*y[0] add esi, eax // adc edi, edx // [edi:esi] += [edx:eax] adc dword ptr[ebp+12],0 // may be extra carry for dst[3] add dword ptr[ebp+4], esi adc dword ptr[ebp+8], edi // dst[1:2] += [edi:esi] mov eax, dword ptr[ebx] // 3. round. Extra carries are overflow mul dword ptr[ecx+8] // [edx:eax] = x[0]*y[2] mov esi, eax // mov edi, edx // [edi:esi] = [edx:eax] mov eax, dword ptr[ebx+4] mul dword ptr[ecx+4] // [edx:eax] = x[1]*y[1] add esi, eax // adc edi, edx // [edi:esi] += [edx:eax] mov eax, dword ptr[ebx+8] mul dword ptr[ecx] // [edx:eax] = x[2]*y[0] add esi, eax // adc edi, edx // [edi:esi] += [edx:eax] add dword ptr[ebp+8 ], esi adc dword ptr[ebp+12], edi // dst[2:3] += [edi:esi] mov eax, dword ptr[ebx] // 4. round. High end and carries are overflow mul dword ptr[ecx+12] // [edx:eax] = x[0]*y[3] mov esi, eax // esi = eax mov eax, dword ptr[ebx+4] // mul dword ptr[ecx+8] // [edx:eax] = x[1]*y[2] add esi, eax // esi += eax mov eax, dword ptr[ebx+8] mul dword ptr[ecx+4] // [edx:eax] = x[2]*y[1] add esi, eax // esi += eax mov eax, dword ptr[ebx+12] mul dword ptr[ecx] // [edx:eax] = x[3]*y[0] add esi, eax // esi += eax add dword ptr[ebp+12], esi // dst[3] += esi pop ebp } } } void int128shl(int shft, void *x) { __asm { mov ecx, shft ; ecx = shift count mov esi, x ; esi = x cmp cl, 64 jae GE64 cmp cl, 32 ; cl < 64 jae _32_63 mov eax, dword ptr[esi+8] ; cl < 32 shld dword ptr[esi+12], eax, cl ; shift s4.i[3] adding bits from s4.i[2] mov eax, dword ptr[esi+4] shld dword ptr[esi+8], eax, cl ; shift s4.i[2] adding bits from s4.i[1] mov eax, dword ptr[esi] shld dword ptr[esi+4], eax, cl ; shift s4.i[1] adding bits from s4.i[0] shl dword ptr[esi], cl ; shift s4.i[0] jmp End _32_63 : ; 32 <= cl < 64 and cl, 1Fh ; cl %= 32 mov eax, dword ptr[esi+8] mov dword ptr[esi+12], eax ; s4.i[3] = s4.i[1] mov eax, dword ptr[esi+4] shld dword ptr[esi+12], eax, cl ; shift s4.i[3] adding bits eax mov dword ptr[esi+8], eax ; s4.i[2] = s4.i[1] mov eax, dword ptr[esi] ; eax = s4.i[0] shld dword ptr[esi+8], eax, cl ; shift s4.i[2] adding bits eax shl eax, cl mov dword ptr[esi+4], eax ; shift s4.i[1] eax << cl (old i[0] << cl) xor eax, eax mov dword ptr[esi], eax ; s4.i[0] = 0 jmp End GE64: ; cl >= 64 cmp cl, 96 jae GE96 and cl, 1Fh ; cl < 96; cl %= 32 mov eax, dword ptr[esi+4] mov dword ptr[esi+12], eax ; s4.i[3] = s4.i[1] mov eax, dword ptr[esi] ; eax = s4.i[0] shld dword ptr[esi+12], eax, cl ; shift s4.i[3] adding bits eax shl eax, cl mov dword ptr[esi+8], eax ; shift s4.i[2] eax << cl xor eax, eax mov dword ptr[esi], eax ; s4.i[0] = 0 mov dword ptr[esi+4], eax ; s4.i[1] = 0 jmp End GE96: ; cl >= 96 cmp cl, 128 jae GE128 and cl, 1Fh ; cl < 128; cl %= 32 mov eax, dword ptr[esi] shl eax, cl mov dword ptr[esi+12], eax ; s4.i[3] = s4.i[0] << cl xor eax, eax mov dword ptr[esi], eax ; s4.i[0] = 0 mov dword ptr[esi+4], eax ; s4.i[1] = 0 mov dword ptr[esi+8], eax ; s4.i[2] = 0 jmp End GE128: ; cl >= 128 xor eax, eax ; return 0 mov dword ptr[esi ], eax mov dword ptr[esi+4 ], eax mov dword ptr[esi+8 ], eax mov dword ptr[esi+12], eax } End:; } void int128shr(int shft, void *x) { // signed shift right __asm { mov ecx, shft ; ecx = shift count mov esi, x ; esi = x cmp cl, 64 jae GE64 cmp cl, 32 ; cl < 64 jae _32_63 mov eax, dword ptr[esi+4] shrd dword ptr[esi], eax, cl ; shift s4[0] new bits from s4[1] (eax) mov eax, dword ptr[esi+8] shrd dword ptr[esi+4], eax, cl ; shift s4[1] new bits from s4[2] (eax) mov eax, dword ptr[esi+12] shrd dword ptr[esi+8], eax, cl ; shift s4[2] new bits from s4[3] (eax) sar dword ptr[esi+12], cl ; shift s4[3] jmp End _32_63 : ; 32 <= cl < 64 and cl, 1Fh ; cl %= 32 mov eax, dword ptr[esi+4] mov dword ptr[esi], eax ; s4[0] = eax (old s4[1]) mov eax, dword ptr[esi+8] ; eax = s4[2] shrd dword ptr[esi], eax, cl ; shift s[0] new bits from s4[2] (eax) mov dword ptr[esi+4], eax ; s4[1] = eax (old s4[2]) mov eax, dword ptr[esi+12] ; eax = s4[3] shrd dword ptr[esi+4], eax, cl ; shift s4[1] new bits from eax (old s4[3]) mov dword ptr[esi+8], eax ; s4[2] = eax (old s4[3]) mov eax, dword ptr[esi+12] ; eax = s4[3] sar eax, 1fh ; eax contain signbit in all bits shrd dword ptr[esi+8], eax, cl ; shift s4[1] new bits from eax (sign of s4[3]) mov dword ptr[esi+12], eax ; s4[3] = eax jmp End GE64: ; cl >= 64 cmp cl, 96 jae GE96 and cl, 1Fh ; cl < 96; cl %= 32 mov eax, dword ptr[esi+8] mov dword ptr[esi], eax ; s4[0] = s4[2] mov eax, dword ptr[esi+12] ; eax = s4[3] shrd dword ptr[esi], eax, cl ; shift s4[0] new bits from eax (old s4[3]) sar eax, cl ; eax contain sign bit in all bits mov dword ptr[esi+4], eax ; s4[1] = eax (old s4[3]) sar eax, 1fh mov dword ptr[esi+8], eax ; s4[2] = sign of s4[3] mov dword ptr[esi+12], eax ; s4[3] = sign of s4[3] jmp End GE96: ; cl >= 96 cmp cl, 128 jae GE128 and cl, 1Fh ; cl < 128; cl %= 32 mov eax, dword ptr[esi+12] sar eax, cl mov dword ptr[esi], eax ; s4[0] = s4[3] >> cl (shift in signbit of s4[3]) sar eax, 1fh mov dword ptr[esi+4 ], eax ; s4[1] = sign of s4[3] mov dword ptr[esi+8 ], eax ; s4[2] = sign of s4[3] mov dword ptr[esi+12], eax ; s4[3] = sign of s4[3] jmp End GE128: ; cl >= 128 mov eax, dword ptr[esi+12] sar eax,1Fh mov dword ptr[esi ], eax mov dword ptr[esi+4 ], eax mov dword ptr[esi+8 ], eax mov dword ptr[esi+12], eax } End:; } void uint128shr(int shft, void *x) { // unsigned shift right __asm { mov ecx, shft ; ecx = shift count mov esi, x cmp cl, 64 jae GE64 cmp cl, 32 ; cl < 64 jae _32_63 mov eax, dword ptr[esi+4] shrd dword ptr[esi], eax, cl ; shift s4[0] new bits from s4[1] (eax) mov eax, dword ptr[esi+8] shrd dword ptr[esi+4], eax, cl ; shift s4[1] new bits from s4[2] (eax) mov eax, dword ptr[esi+12] shrd dword ptr[esi+8], eax, cl ; shift s4[2] new bits from s4[3] (eax) shr dword ptr[esi+12], cl ; shift s4[3] jmp End _32_63 : ; 32 <= cl < 64 and cl, 1Fh ; cl %= 32 mov eax, dword ptr[esi+4] mov dword ptr[esi], eax ; s4[0] = eax (old s4[1]) mov eax, dword ptr[esi+8] ; eax = s4[2] shrd dword ptr[esi], eax, cl ; shift s[0] new bits from s4[2] (eax) mov dword ptr[esi+4], eax ; s4[1] = eax (old s4[2]) mov eax, dword ptr[esi+12] ; eax = s4[3] shrd dword ptr[esi+4], eax, cl ; shift s4[1] new bits from eax (old s4[3]) shr eax, cl mov dword ptr[esi+8], eax ; s4[2] = eax << cl xor eax, eax mov dword ptr[esi+12], eax ; s4[3] = 0 jmp End GE64: ; cl >= 64 cmp cl, 96 jae GE96 and cl, 1Fh ; cl < 96; cl %= 32 mov eax, dword ptr[esi+8] mov dword ptr[esi], eax ; s4[0] = s4[2] mov eax, dword ptr[esi+12] ; eax = s4[3] shrd dword ptr[esi], eax, cl ; shift s4[0] new bits from eax (old s4[3]) shr eax, cl mov dword ptr[esi+4], eax ; s4[1] = eax >> cl (old s4[3] >> cl) xor eax, eax mov dword ptr[esi+8], eax ; s4[2] = 0 mov dword ptr[esi+12], eax ; s4[3] = 0 jmp End GE96: ; cl >= 96 cmp cl, 128 jae GE128 and cl, 1Fh ; cl < 128; cl %= 32 mov eax, dword ptr[esi+12] shr eax, cl mov dword ptr[esi], eax ; s4[0] = s4[3] >> cl xor eax, eax mov dword ptr[esi+4 ], eax ; s4[1] = 0 mov dword ptr[esi+8 ], eax ; s4[2] = 0 mov dword ptr[esi+12], eax ; s4[3] = 0 jmp End GE128: ; cl >= 128 xor eax,eax ; return 0 mov dword ptr[esi ], eax mov dword ptr[esi+4 ], eax mov dword ptr[esi+8 ], eax mov dword ptr[esi+12], eax } End:; } static UINT getFirst16(const _uint128 &n, int &expo2) { UINT result, expo; __asm { pushf mov ecx , 4 mov edi , n add edi , 12 xor eax , eax std repe scasd jnz SearchBit xor edx , edx jmp End1Result SearchBit: ; assume ecx hold the index of integer with first 1-bit mov edx , dword ptr [edi+4] bsr eax , edx ; eax holds index of highest 1 bit cmp eax , 15 je End2Results jg TooManyBits test ecx , ecx ; Get some bits from previous, if we got some je End2Results shl ecx , 5 add ecx , eax mov expo, ecx mov edi , dword ptr[edi] ; edi = previous int mov ecx , 15 sub ecx , eax ; ecx = 15 - index of higest 1-bit shld edx , edi, cl ; Shift edx left adding new bits from previous digit (edi) jmp End1Result TooManyBits: ; assume eax = index of higest 1-bit in edx (>15) shl ecx , 5 add ecx , eax mov expo, ecx mov ecx , eax sub ecx , 15 shr edx , cl jmp End1Result End2Results: shl ecx , 5 add ecx , eax mov expo, ecx End1Result: mov result, edx popf } expo2 = expo; return result; } static UINT getFirst32(const _uint128 &n, int &expo2) { UINT result, expo; __asm { pushf mov ecx, 4 mov edi, n add edi, 12 xor eax, eax std repe scasd jnz SearchBit xor edx, edx jmp End1Result SearchBit: ; assume ecx hold the index of integer with first 1-bit mov edx, dword ptr[edi+4] bsr eax, edx ; eax holds index of highest 1 bit cmp eax, 31 je End2Results test ecx, ecx ; Get some bits from next, if we got some je End2Results shl ecx, 5 add ecx, eax mov expo, ecx mov edi, dword ptr[edi] ; edi = previous int mov ecx, 31 sub ecx, eax ; ecx = 31 - bits already in edx shld edx, edi, cl ; Shift edx left adding new bits from previous digit (edi) jmp End1Result End2Results: shl ecx , 5 add ecx , eax mov expo , ecx End1Result: mov result, edx popf } expo2 = expo; return result; } static inline int getExpo2(UINT x) { UINT result; _asm { bsr eax, x mov result, eax } return result; } // Special class to perform fast multiplication of _uint128 and UINT // no need to do any accumulation of "cross-multiplications" cause 2. operand // is only 32 bits class _uint128FastMul : public _uint128 { public: inline _uint128FastMul &operator=(const _uint128 &src) { HI64(*this) = HI64(src); LO64(*this) = LO64(src); return *this; } inline _uint128FastMul &operator*=(UINT k) { const _uint128FastMul copy(*this); _asm { mov edi, this lea esi, copy mov eax, dword ptr[esi] mul k mov dword ptr[edi], eax mov dword ptr[edi+4], edx mov eax, dword ptr[esi+4] mul k add dword ptr[edi+4], eax mov dword ptr[edi+8], edx adc dword ptr[edi+8], 0 mov eax, dword ptr[esi+8] mul k add dword ptr[edi+8], eax mov dword ptr[edi+12], edx adc dword ptr[edi+12], 0 mov eax, dword ptr[esi+12] mul k add dword ptr[edi+12], eax } return *this; } }; static void unsignedQuotRemainder(const _uint128 &a, const _uint128 &y, _uint128 *quot, _uint128 &rem) { _uint128FastMul p128; int lastShift = 0; rem = a; if(quot) *quot = 0; if(y < 0x8000) { int yExpo2 = getExpo2((UINT)y); const int yScale = 15 - yExpo2; const UINT y16 = (UINT)y << yScale; for(int count = 0; rem >= y16; count++) { int remExpo2; const UINT rem32 = getFirst32(rem, remExpo2); const int rem32Expo2 = getExpo2(rem32); UINT q32; int shift; p128 = y; if((shift = remExpo2 - rem32Expo2 + yScale) < 0) { // >= -31 q32 = (rem32 / (y16+1)) >> -shift; shift = 0; switch(q32) { case 0 : q32 = 1 ; break; case 1 : break; default: p128 *= q32; break; } } else { q32 = rem32 / (y16 + 1); switch(q32) { case 0 : q32 = 1 ; break; case 1 : break; default: p128 *= q32; break; } if(shift) int128shl(shift, &p128); } if(quot) { // do we want the quot. If its NULL there's no need to do this if(count) { if(lastShift > shift) { int128shl(lastShift - shift, quot); } __asm { mov eax, q32 mov esi, quot add dword ptr[esi], eax jnc AddDone1 adc dword ptr[esi+4] ,0 adc dword ptr[esi+8] ,0 adc dword ptr[esi+12],0 } } else { *quot = q32; } AddDone1: lastShift = shift; } rem -= p128; } // rem < 0xffff if(quot) { if(lastShift) { int128shl(lastShift, quot); } *quot += (UINT)rem / (UINT)y; } rem = (UINT)rem % (UINT)y; } else { int yExpo2; const UINT y16 = getFirst16(y, yExpo2); const int yScale = yExpo2 - getExpo2(y16); for(int count = 0; rem >= y; count++) { int remExpo2; const UINT rem32 = getFirst32(rem, remExpo2); const int rem32Expo2 = getExpo2(rem32); UINT q32; int shift; p128 = y; if((shift = remExpo2 - rem32Expo2 - yScale) < 0) { // >= -31 q32 = (rem32 / (y16+1)) >> -shift; shift = 0; switch(q32) { case 0 : q32 = 1 ; break; case 1 : break; default: p128 *= q32; break; } } else { q32 = rem32 / (y16 + 1); switch(q32) { case 0 : q32 = 1 ; break; case 1 : break; default: p128 *= q32; break; } if(shift) int128shl(shift, &p128); } if(quot) { // do we want the quot. If its NULL there's no need to do this if(count) { if(lastShift > shift) { int128shl(lastShift - shift, quot); } __asm { mov eax, q32 mov esi, quot add dword ptr[esi], eax jnc AddDone2 adc dword ptr[esi+4] ,0 adc dword ptr[esi+8] ,0 adc dword ptr[esi+12],0 } } else { *quot = q32; } AddDone2: lastShift = shift; } rem -= p128; } if(lastShift && quot) { int128shl(lastShift, quot); } } } static void signedQuotRemainder(const _int128 &a, const _int128 &b, _int128 *quot, _int128 &rem) { const bool aNegative = a.isNegative(); const bool bNegative = b.isNegative(); if(!aNegative && !bNegative) { unsignedQuotRemainder(*(const _uint128*)&a, *(const _uint128*)&b, (_uint128*)quot, (_uint128&)rem); } else { unsignedQuotRemainder((_uint128)(aNegative?-a:a), (_uint128)(bNegative?-b:b), (_uint128*)quot, (_uint128&)rem); if(quot && (aNegative != bNegative)) int128neg(quot); if(aNegative) int128neg(&rem); } } void int128div(void *dst, void *x) { const _int128 a(*(_int128*)dst); _int128 rem; signedQuotRemainder(a, *(const _int128*)x, (_int128*)dst, rem); } void int128rem(void *dst, void *x) { const _int128 a(*(_int128*)dst); signedQuotRemainder(a, *(const _int128*)x, NULL, *(_int128*)dst); } void uint128div(void *dst, const void *x) { const _uint128 a(*(_uint128*)dst); _uint128 rem; unsignedQuotRemainder(a, *(const _uint128*)x, (_uint128*)dst, rem); } void uint128rem(void *dst, const void *x) { const _uint128 a(*(_uint128*)dst); unsignedQuotRemainder(a, *(const _uint128*)x, NULL, *(_uint128*)dst); } void int128neg(void *x) { __asm { mov esi, x not dword ptr[esi ] not dword ptr[esi+ 4] not dword ptr[esi+ 8] not dword ptr[esi+12] add dword ptr[esi ], 1 adc dword ptr[esi+ 4], 0 adc dword ptr[esi+ 8], 0 adc dword ptr[esi+12], 0 } } int int128cmp(const void *x1, const void *x2) { int result; __asm { mov esi, x1 mov edi, x2 mov eax, dword ptr[esi+12] cmp eax, dword ptr[edi+12] jl lessthan ; signed compare of high int jg greaterthan mov eax, dword ptr[esi+8] cmp eax, dword ptr[edi+8] jb lessthan ; unsigned compare of the 3 others ja greaterthan mov eax, dword ptr[esi+4] cmp eax, dword ptr[edi+4] jb lessthan ja greaterthan mov eax, dword ptr[esi] ; cmp eax, dword ptr[edi] ; jb lessthan ja greaterthan mov result, 0 ; they are equal jmp End greaterthan: mov result, 1 jmp End lessThan: mov result, -1 } End: return result; } int uint128cmp(const void *x1, const void *x2) { int result; __asm { mov esi, x1 mov edi, x2 mov eax, dword ptr[esi+12] cmp eax, dword ptr[edi+12] jb lessthan ; unsigned compare of all integers ja greaterthan mov eax, dword ptr[esi+8] cmp eax, dword ptr[edi+8] jb lessthan ja greaterthan mov eax, dword ptr[esi+4] cmp eax, dword ptr[edi+4] jb lessthan ja greaterthan mov eax, dword ptr[esi] ; cmp eax, dword ptr[edi] ; jb lessthan ja greaterthan mov result, 0 ; they are equal jmp End greaterthan: mov result, 1 jmp End lessThan: mov result, -1 } End: return result; } #endif // _M_X64