Patrick Kelley 8fd444092b initial
2025-05-07 15:35:15 -04:00

728 lines
25 KiB
C++

// The code in this file is only used for x86 compilation.
// For x64 mode the file Int128x64.asm should be used instead.
// They execute much faster, because they use 64-bit registers,
// and division functions are coded entirely in assembler.
#ifndef _M_X64
#include "Int128.h"
typedef unsigned int UINT;
void int128add( void *dst, const void *x) {
__asm {
mov esi, x
mov edi, dst
mov eax, dword ptr[esi]
add dword ptr[edi], eax
mov eax, dword ptr[esi+4]
adc dword ptr[edi+4], eax
mov eax, dword ptr[esi+8]
adc dword ptr[edi+8], eax
mov eax, dword ptr[esi+12]
adc dword ptr[edi+12], eax
}
}
void int128sub(void *dst, const void *x) {
__asm {
mov esi, x
mov edi, dst
mov eax, dword ptr[esi]
sub dword ptr[edi], eax
mov eax, dword ptr[esi+4]
sbb dword ptr[edi+4], eax
mov eax, dword ptr[esi+8]
sbb dword ptr[edi+8], eax
mov eax, dword ptr[esi+12]
sbb dword ptr[edi+12], eax
}
}
void int128inc(void *x) {
__asm {
mov edi, x
add dword ptr[edi], 1
jnc Done
adc dword ptr[edi+4], 0
adc dword ptr[edi+8], 0
adc dword ptr[edi+12], 0
Done:
}
}
void int128dec(void *x) {
__asm {
mov edi, x
sub dword ptr[edi], 1
jnc Done
sbb dword ptr[edi+4], 0
sbb dword ptr[edi+8], 0
sbb dword ptr[edi+12], 0
Done:
}
}
#pragma warning(disable:4731)
void int128mul(void *dst, const void *x) {
_int128 *dp = (_int128*)dst;
const _int128 *b = (const _int128*)x;
if (!(dp->s4.i[1] || b->s4.i[1] || dp->s4.i[2] || b->s4.i[2] || dp->s4.i[3] || b->s4.i[3])) {
HI64(*dp) = 0;
LO64(*dp) = __int64(dp->s4.i[0]) * b->s4.i[0]; // simple _int64 multiplication. int32 * int32
}
else {
const _int128 a(*dp);
__asm {
push ebp
lea ebx, a // ebx = &a
mov ecx, b // ecx = b
mov ebp, dst // ebp = &dp (dst)
xor eax, eax
mov dword ptr[ebp+8 ], eax // dst[2..3] = 0
mov dword ptr[ebp+12], eax
mov eax, dword ptr[ebx] // 1. round
mul dword ptr[ecx] // [edx:eax] = x[0]*y[0]
mov dword ptr[ebp] , eax
mov dword ptr[ebp+4], edx // dst[0:1] = [edx:eax]
mov eax, dword ptr[ebx] // 2. round
mul dword ptr[ecx+4] // [edx:eax] = x[0]*y[1]
mov esi, eax //
mov edi, edx // [edi:esi] = [edx:eax]
mov eax, dword ptr[ebx+4]
mul dword ptr[ecx] // [edx:eax] = x[1]*y[0]
add esi, eax //
adc edi, edx // [edi:esi] += [edx:eax]
adc dword ptr[ebp+12],0 // may be extra carry for dst[3]
add dword ptr[ebp+4], esi
adc dword ptr[ebp+8], edi // dst[1:2] += [edi:esi]
mov eax, dword ptr[ebx] // 3. round. Extra carries are overflow
mul dword ptr[ecx+8] // [edx:eax] = x[0]*y[2]
mov esi, eax //
mov edi, edx // [edi:esi] = [edx:eax]
mov eax, dword ptr[ebx+4]
mul dword ptr[ecx+4] // [edx:eax] = x[1]*y[1]
add esi, eax //
adc edi, edx // [edi:esi] += [edx:eax]
mov eax, dword ptr[ebx+8]
mul dword ptr[ecx] // [edx:eax] = x[2]*y[0]
add esi, eax //
adc edi, edx // [edi:esi] += [edx:eax]
add dword ptr[ebp+8 ], esi
adc dword ptr[ebp+12], edi // dst[2:3] += [edi:esi]
mov eax, dword ptr[ebx] // 4. round. High end and carries are overflow
mul dword ptr[ecx+12] // [edx:eax] = x[0]*y[3]
mov esi, eax // esi = eax
mov eax, dword ptr[ebx+4] //
mul dword ptr[ecx+8] // [edx:eax] = x[1]*y[2]
add esi, eax // esi += eax
mov eax, dword ptr[ebx+8]
mul dword ptr[ecx+4] // [edx:eax] = x[2]*y[1]
add esi, eax // esi += eax
mov eax, dword ptr[ebx+12]
mul dword ptr[ecx] // [edx:eax] = x[3]*y[0]
add esi, eax // esi += eax
add dword ptr[ebp+12], esi // dst[3] += esi
pop ebp
}
}
}
void int128shl(int shft, void *x) {
__asm {
mov ecx, shft ; ecx = shift count
mov esi, x ; esi = x
cmp cl, 64
jae GE64
cmp cl, 32 ; cl < 64
jae _32_63
mov eax, dword ptr[esi+8] ; cl < 32
shld dword ptr[esi+12], eax, cl ; shift s4.i[3] adding bits from s4.i[2]
mov eax, dword ptr[esi+4]
shld dword ptr[esi+8], eax, cl ; shift s4.i[2] adding bits from s4.i[1]
mov eax, dword ptr[esi]
shld dword ptr[esi+4], eax, cl ; shift s4.i[1] adding bits from s4.i[0]
shl dword ptr[esi], cl ; shift s4.i[0]
jmp End
_32_63 : ; 32 <= cl < 64
and cl, 1Fh ; cl %= 32
mov eax, dword ptr[esi+8]
mov dword ptr[esi+12], eax ; s4.i[3] = s4.i[1]
mov eax, dword ptr[esi+4]
shld dword ptr[esi+12], eax, cl ; shift s4.i[3] adding bits eax
mov dword ptr[esi+8], eax ; s4.i[2] = s4.i[1]
mov eax, dword ptr[esi] ; eax = s4.i[0]
shld dword ptr[esi+8], eax, cl ; shift s4.i[2] adding bits eax
shl eax, cl
mov dword ptr[esi+4], eax ; shift s4.i[1] eax << cl (old i[0] << cl)
xor eax, eax
mov dword ptr[esi], eax ; s4.i[0] = 0
jmp End
GE64: ; cl >= 64
cmp cl, 96
jae GE96
and cl, 1Fh ; cl < 96; cl %= 32
mov eax, dword ptr[esi+4]
mov dword ptr[esi+12], eax ; s4.i[3] = s4.i[1]
mov eax, dword ptr[esi] ; eax = s4.i[0]
shld dword ptr[esi+12], eax, cl ; shift s4.i[3] adding bits eax
shl eax, cl
mov dword ptr[esi+8], eax ; shift s4.i[2] eax << cl
xor eax, eax
mov dword ptr[esi], eax ; s4.i[0] = 0
mov dword ptr[esi+4], eax ; s4.i[1] = 0
jmp End
GE96: ; cl >= 96
cmp cl, 128
jae GE128
and cl, 1Fh ; cl < 128; cl %= 32
mov eax, dword ptr[esi]
shl eax, cl
mov dword ptr[esi+12], eax ; s4.i[3] = s4.i[0] << cl
xor eax, eax
mov dword ptr[esi], eax ; s4.i[0] = 0
mov dword ptr[esi+4], eax ; s4.i[1] = 0
mov dword ptr[esi+8], eax ; s4.i[2] = 0
jmp End
GE128: ; cl >= 128
xor eax, eax ; return 0
mov dword ptr[esi ], eax
mov dword ptr[esi+4 ], eax
mov dword ptr[esi+8 ], eax
mov dword ptr[esi+12], eax
}
End:;
}
void int128shr(int shft, void *x) { // signed shift right
__asm {
mov ecx, shft ; ecx = shift count
mov esi, x ; esi = x
cmp cl, 64
jae GE64
cmp cl, 32 ; cl < 64
jae _32_63
mov eax, dword ptr[esi+4]
shrd dword ptr[esi], eax, cl ; shift s4[0] new bits from s4[1] (eax)
mov eax, dword ptr[esi+8]
shrd dword ptr[esi+4], eax, cl ; shift s4[1] new bits from s4[2] (eax)
mov eax, dword ptr[esi+12]
shrd dword ptr[esi+8], eax, cl ; shift s4[2] new bits from s4[3] (eax)
sar dword ptr[esi+12], cl ; shift s4[3]
jmp End
_32_63 : ; 32 <= cl < 64
and cl, 1Fh ; cl %= 32
mov eax, dword ptr[esi+4]
mov dword ptr[esi], eax ; s4[0] = eax (old s4[1])
mov eax, dword ptr[esi+8] ; eax = s4[2]
shrd dword ptr[esi], eax, cl ; shift s[0] new bits from s4[2] (eax)
mov dword ptr[esi+4], eax ; s4[1] = eax (old s4[2])
mov eax, dword ptr[esi+12] ; eax = s4[3]
shrd dword ptr[esi+4], eax, cl ; shift s4[1] new bits from eax (old s4[3])
mov dword ptr[esi+8], eax ; s4[2] = eax (old s4[3])
mov eax, dword ptr[esi+12] ; eax = s4[3]
sar eax, 1fh ; eax contain signbit in all bits
shrd dword ptr[esi+8], eax, cl ; shift s4[1] new bits from eax (sign of s4[3])
mov dword ptr[esi+12], eax ; s4[3] = eax
jmp End
GE64: ; cl >= 64
cmp cl, 96
jae GE96
and cl, 1Fh ; cl < 96; cl %= 32
mov eax, dword ptr[esi+8]
mov dword ptr[esi], eax ; s4[0] = s4[2]
mov eax, dword ptr[esi+12] ; eax = s4[3]
shrd dword ptr[esi], eax, cl ; shift s4[0] new bits from eax (old s4[3])
sar eax, cl ; eax contain sign bit in all bits
mov dword ptr[esi+4], eax ; s4[1] = eax (old s4[3])
sar eax, 1fh
mov dword ptr[esi+8], eax ; s4[2] = sign of s4[3]
mov dword ptr[esi+12], eax ; s4[3] = sign of s4[3]
jmp End
GE96: ; cl >= 96
cmp cl, 128
jae GE128
and cl, 1Fh ; cl < 128; cl %= 32
mov eax, dword ptr[esi+12]
sar eax, cl
mov dword ptr[esi], eax ; s4[0] = s4[3] >> cl (shift in signbit of s4[3])
sar eax, 1fh
mov dword ptr[esi+4 ], eax ; s4[1] = sign of s4[3]
mov dword ptr[esi+8 ], eax ; s4[2] = sign of s4[3]
mov dword ptr[esi+12], eax ; s4[3] = sign of s4[3]
jmp End
GE128: ; cl >= 128
mov eax, dword ptr[esi+12]
sar eax,1Fh
mov dword ptr[esi ], eax
mov dword ptr[esi+4 ], eax
mov dword ptr[esi+8 ], eax
mov dword ptr[esi+12], eax
}
End:;
}
void uint128shr(int shft, void *x) { // unsigned shift right
__asm {
mov ecx, shft ; ecx = shift count
mov esi, x
cmp cl, 64
jae GE64
cmp cl, 32 ; cl < 64
jae _32_63
mov eax, dword ptr[esi+4]
shrd dword ptr[esi], eax, cl ; shift s4[0] new bits from s4[1] (eax)
mov eax, dword ptr[esi+8]
shrd dword ptr[esi+4], eax, cl ; shift s4[1] new bits from s4[2] (eax)
mov eax, dword ptr[esi+12]
shrd dword ptr[esi+8], eax, cl ; shift s4[2] new bits from s4[3] (eax)
shr dword ptr[esi+12], cl ; shift s4[3]
jmp End
_32_63 : ; 32 <= cl < 64
and cl, 1Fh ; cl %= 32
mov eax, dword ptr[esi+4]
mov dword ptr[esi], eax ; s4[0] = eax (old s4[1])
mov eax, dword ptr[esi+8] ; eax = s4[2]
shrd dword ptr[esi], eax, cl ; shift s[0] new bits from s4[2] (eax)
mov dword ptr[esi+4], eax ; s4[1] = eax (old s4[2])
mov eax, dword ptr[esi+12] ; eax = s4[3]
shrd dword ptr[esi+4], eax, cl ; shift s4[1] new bits from eax (old s4[3])
shr eax, cl
mov dword ptr[esi+8], eax ; s4[2] = eax << cl
xor eax, eax
mov dword ptr[esi+12], eax ; s4[3] = 0
jmp End
GE64: ; cl >= 64
cmp cl, 96
jae GE96
and cl, 1Fh ; cl < 96; cl %= 32
mov eax, dword ptr[esi+8]
mov dword ptr[esi], eax ; s4[0] = s4[2]
mov eax, dword ptr[esi+12] ; eax = s4[3]
shrd dword ptr[esi], eax, cl ; shift s4[0] new bits from eax (old s4[3])
shr eax, cl
mov dword ptr[esi+4], eax ; s4[1] = eax >> cl (old s4[3] >> cl)
xor eax, eax
mov dword ptr[esi+8], eax ; s4[2] = 0
mov dword ptr[esi+12], eax ; s4[3] = 0
jmp End
GE96: ; cl >= 96
cmp cl, 128
jae GE128
and cl, 1Fh ; cl < 128; cl %= 32
mov eax, dword ptr[esi+12]
shr eax, cl
mov dword ptr[esi], eax ; s4[0] = s4[3] >> cl
xor eax, eax
mov dword ptr[esi+4 ], eax ; s4[1] = 0
mov dword ptr[esi+8 ], eax ; s4[2] = 0
mov dword ptr[esi+12], eax ; s4[3] = 0
jmp End
GE128: ; cl >= 128
xor eax,eax ; return 0
mov dword ptr[esi ], eax
mov dword ptr[esi+4 ], eax
mov dword ptr[esi+8 ], eax
mov dword ptr[esi+12], eax
}
End:;
}
static UINT getFirst16(const _uint128 &n, int &expo2) {
UINT result, expo;
__asm {
pushf
mov ecx , 4
mov edi , n
add edi , 12
xor eax , eax
std
repe scasd
jnz SearchBit
xor edx , edx
jmp End1Result
SearchBit: ; assume ecx hold the index of integer with first 1-bit
mov edx , dword ptr [edi+4]
bsr eax , edx ; eax holds index of highest 1 bit
cmp eax , 15
je End2Results
jg TooManyBits
test ecx , ecx ; Get some bits from previous, if we got some
je End2Results
shl ecx , 5
add ecx , eax
mov expo, ecx
mov edi , dword ptr[edi] ; edi = previous int
mov ecx , 15
sub ecx , eax ; ecx = 15 - index of higest 1-bit
shld edx , edi, cl ; Shift edx left adding new bits from previous digit (edi)
jmp End1Result
TooManyBits: ; assume eax = index of higest 1-bit in edx (>15)
shl ecx , 5
add ecx , eax
mov expo, ecx
mov ecx , eax
sub ecx , 15
shr edx , cl
jmp End1Result
End2Results:
shl ecx , 5
add ecx , eax
mov expo, ecx
End1Result:
mov result, edx
popf
}
expo2 = expo;
return result;
}
static UINT getFirst32(const _uint128 &n, int &expo2) {
UINT result, expo;
__asm {
pushf
mov ecx, 4
mov edi, n
add edi, 12
xor eax, eax
std
repe scasd
jnz SearchBit
xor edx, edx
jmp End1Result
SearchBit: ; assume ecx hold the index of integer with first 1-bit
mov edx, dword ptr[edi+4]
bsr eax, edx ; eax holds index of highest 1 bit
cmp eax, 31
je End2Results
test ecx, ecx ; Get some bits from next, if we got some
je End2Results
shl ecx, 5
add ecx, eax
mov expo, ecx
mov edi, dword ptr[edi] ; edi = previous int
mov ecx, 31
sub ecx, eax ; ecx = 31 - bits already in edx
shld edx, edi, cl ; Shift edx left adding new bits from previous digit (edi)
jmp End1Result
End2Results:
shl ecx , 5
add ecx , eax
mov expo , ecx
End1Result:
mov result, edx
popf
}
expo2 = expo;
return result;
}
static inline int getExpo2(UINT x) {
UINT result;
_asm {
bsr eax, x
mov result, eax
}
return result;
}
// Special class to perform fast multiplication of _uint128 and UINT
// no need to do any accumulation of "cross-multiplications" cause 2. operand
// is only 32 bits
class _uint128FastMul : public _uint128 {
public:
inline _uint128FastMul &operator=(const _uint128 &src) {
HI64(*this) = HI64(src); LO64(*this) = LO64(src);
return *this;
}
inline _uint128FastMul &operator*=(UINT k) {
const _uint128FastMul copy(*this);
_asm {
mov edi, this
lea esi, copy
mov eax, dword ptr[esi]
mul k
mov dword ptr[edi], eax
mov dword ptr[edi+4], edx
mov eax, dword ptr[esi+4]
mul k
add dword ptr[edi+4], eax
mov dword ptr[edi+8], edx
adc dword ptr[edi+8], 0
mov eax, dword ptr[esi+8]
mul k
add dword ptr[edi+8], eax
mov dword ptr[edi+12], edx
adc dword ptr[edi+12], 0
mov eax, dword ptr[esi+12]
mul k
add dword ptr[edi+12], eax
}
return *this;
}
};
static void unsignedQuotRemainder(const _uint128 &a, const _uint128 &y, _uint128 *quot, _uint128 &rem) {
_uint128FastMul p128;
int lastShift = 0;
rem = a;
if(quot) *quot = 0;
if(y < 0x8000) {
int yExpo2 = getExpo2((UINT)y);
const int yScale = 15 - yExpo2;
const UINT y16 = (UINT)y << yScale;
for(int count = 0; rem >= y16; count++) {
int remExpo2;
const UINT rem32 = getFirst32(rem, remExpo2);
const int rem32Expo2 = getExpo2(rem32);
UINT q32;
int shift;
p128 = y;
if((shift = remExpo2 - rem32Expo2 + yScale) < 0) { // >= -31
q32 = (rem32 / (y16+1)) >> -shift;
shift = 0;
switch(q32) {
case 0 : q32 = 1 ; break;
case 1 : break;
default: p128 *= q32; break;
}
} else {
q32 = rem32 / (y16 + 1);
switch(q32) {
case 0 : q32 = 1 ; break;
case 1 : break;
default: p128 *= q32; break;
}
if(shift) int128shl(shift, &p128);
}
if(quot) { // do we want the quot. If its NULL there's no need to do this
if(count) {
if(lastShift > shift) {
int128shl(lastShift - shift, quot);
}
__asm {
mov eax, q32
mov esi, quot
add dword ptr[esi], eax
jnc AddDone1
adc dword ptr[esi+4] ,0
adc dword ptr[esi+8] ,0
adc dword ptr[esi+12],0
}
} else {
*quot = q32;
}
AddDone1:
lastShift = shift;
}
rem -= p128;
}
// rem < 0xffff
if(quot) {
if(lastShift) {
int128shl(lastShift, quot);
}
*quot += (UINT)rem / (UINT)y;
}
rem = (UINT)rem % (UINT)y;
} else {
int yExpo2;
const UINT y16 = getFirst16(y, yExpo2);
const int yScale = yExpo2 - getExpo2(y16);
for(int count = 0; rem >= y; count++) {
int remExpo2;
const UINT rem32 = getFirst32(rem, remExpo2);
const int rem32Expo2 = getExpo2(rem32);
UINT q32;
int shift;
p128 = y;
if((shift = remExpo2 - rem32Expo2 - yScale) < 0) { // >= -31
q32 = (rem32 / (y16+1)) >> -shift;
shift = 0;
switch(q32) {
case 0 : q32 = 1 ; break;
case 1 : break;
default: p128 *= q32; break;
}
} else {
q32 = rem32 / (y16 + 1);
switch(q32) {
case 0 : q32 = 1 ; break;
case 1 : break;
default: p128 *= q32; break;
}
if(shift) int128shl(shift, &p128);
}
if(quot) { // do we want the quot. If its NULL there's no need to do this
if(count) {
if(lastShift > shift) {
int128shl(lastShift - shift, quot);
}
__asm {
mov eax, q32
mov esi, quot
add dword ptr[esi], eax
jnc AddDone2
adc dword ptr[esi+4] ,0
adc dword ptr[esi+8] ,0
adc dword ptr[esi+12],0
}
} else {
*quot = q32;
}
AddDone2:
lastShift = shift;
}
rem -= p128;
}
if(lastShift && quot) {
int128shl(lastShift, quot);
}
}
}
static void signedQuotRemainder(const _int128 &a, const _int128 &b, _int128 *quot, _int128 &rem) {
const bool aNegative = a.isNegative();
const bool bNegative = b.isNegative();
if(!aNegative && !bNegative) {
unsignedQuotRemainder(*(const _uint128*)&a, *(const _uint128*)&b, (_uint128*)quot, (_uint128&)rem);
} else {
unsignedQuotRemainder((_uint128)(aNegative?-a:a), (_uint128)(bNegative?-b:b), (_uint128*)quot, (_uint128&)rem);
if(quot && (aNegative != bNegative)) int128neg(quot);
if(aNegative) int128neg(&rem);
}
}
void int128div(void *dst, void *x) {
const _int128 a(*(_int128*)dst);
_int128 rem;
signedQuotRemainder(a, *(const _int128*)x, (_int128*)dst, rem);
}
void int128rem(void *dst, void *x) {
const _int128 a(*(_int128*)dst);
signedQuotRemainder(a, *(const _int128*)x, NULL, *(_int128*)dst);
}
void uint128div(void *dst, const void *x) {
const _uint128 a(*(_uint128*)dst);
_uint128 rem;
unsignedQuotRemainder(a, *(const _uint128*)x, (_uint128*)dst, rem);
}
void uint128rem(void *dst, const void *x) {
const _uint128 a(*(_uint128*)dst);
unsignedQuotRemainder(a, *(const _uint128*)x, NULL, *(_uint128*)dst);
}
void int128neg(void *x) {
__asm {
mov esi, x
not dword ptr[esi ]
not dword ptr[esi+ 4]
not dword ptr[esi+ 8]
not dword ptr[esi+12]
add dword ptr[esi ], 1
adc dword ptr[esi+ 4], 0
adc dword ptr[esi+ 8], 0
adc dword ptr[esi+12], 0
}
}
int int128cmp(const void *x1, const void *x2) {
int result;
__asm {
mov esi, x1
mov edi, x2
mov eax, dword ptr[esi+12]
cmp eax, dword ptr[edi+12]
jl lessthan ; signed compare of high int
jg greaterthan
mov eax, dword ptr[esi+8]
cmp eax, dword ptr[edi+8]
jb lessthan ; unsigned compare of the 3 others
ja greaterthan
mov eax, dword ptr[esi+4]
cmp eax, dword ptr[edi+4]
jb lessthan
ja greaterthan
mov eax, dword ptr[esi] ;
cmp eax, dword ptr[edi] ;
jb lessthan
ja greaterthan
mov result, 0 ; they are equal
jmp End
greaterthan:
mov result, 1
jmp End
lessThan:
mov result, -1
}
End:
return result;
}
int uint128cmp(const void *x1, const void *x2) {
int result;
__asm {
mov esi, x1
mov edi, x2
mov eax, dword ptr[esi+12]
cmp eax, dword ptr[edi+12]
jb lessthan ; unsigned compare of all integers
ja greaterthan
mov eax, dword ptr[esi+8]
cmp eax, dword ptr[edi+8]
jb lessthan
ja greaterthan
mov eax, dword ptr[esi+4]
cmp eax, dword ptr[edi+4]
jb lessthan
ja greaterthan
mov eax, dword ptr[esi] ;
cmp eax, dword ptr[edi] ;
jb lessthan
ja greaterthan
mov result, 0 ; they are equal
jmp End
greaterthan:
mov result, 1
jmp End
lessThan:
mov result, -1
}
End:
return result;
}
#endif // _M_X64