aboutsummaryrefslogtreecommitdiff
path: root/gmp-6.3.0/mpn/x86/p6
diff options
context:
space:
mode:
Diffstat (limited to 'gmp-6.3.0/mpn/x86/p6')
-rw-r--r--gmp-6.3.0/mpn/x86/p6/README125
-rw-r--r--gmp-6.3.0/mpn/x86/p6/aors_n.asm156
-rw-r--r--gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm320
-rw-r--r--gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm287
-rw-r--r--gmp-6.3.0/mpn/x86/p6/copyd.asm178
-rw-r--r--gmp-6.3.0/mpn/x86/p6/dive_1.asm267
-rw-r--r--gmp-6.3.0/mpn/x86/p6/gcd_11.asm83
-rw-r--r--gmp-6.3.0/mpn/x86/p6/gmp-mparam.h194
-rw-r--r--gmp-6.3.0/mpn/x86/p6/lshsub_n.asm169
-rw-r--r--gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm767
-rw-r--r--gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h218
-rw-r--r--gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm38
-rw-r--r--gmp-6.3.0/mpn/x86/p6/mmx/popham.asm39
-rw-r--r--gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm38
-rw-r--r--gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm190
-rw-r--r--gmp-6.3.0/mpn/x86/p6/mode1o.asm170
-rw-r--r--gmp-6.3.0/mpn/x86/p6/mul_basecase.asm607
-rw-r--r--gmp-6.3.0/mpn/x86/p6/p3mmx/popham.asm42
-rw-r--r--gmp-6.3.0/mpn/x86/p6/sqr_basecase.asm649
-rw-r--r--gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm37
-rw-r--r--gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h200
-rw-r--r--gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm34
-rw-r--r--gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm34
-rw-r--r--gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm38
-rw-r--r--gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm35
-rw-r--r--gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm35
-rw-r--r--gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm35
-rw-r--r--gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm35
28 files changed, 5020 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86/p6/README b/gmp-6.3.0/mpn/x86/p6/README
new file mode 100644
index 0000000..f19d47b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/README
@@ -0,0 +1,125 @@
+Copyright 2000, 2001 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
+
+
+
+
+
+ INTEL P6 MPN SUBROUTINES
+
+
+
+This directory contains code optimized for Intel P6 class CPUs, meaning
+PentiumPro, Pentium II and Pentium III. The mmx and p3mmx subdirectories
+have routines using MMX instructions.
+
+
+
+STATUS
+
+Times for the loops, with all code and data in L1 cache, are as follows.
+Some of these might be able to be improved.
+
+ cycles/limb
+
+ mpn_add_n/sub_n 3.7
+
+ mpn_copyi 0.75
+ mpn_copyd 1.75 (or 0.75 if no overlap)
+
+ mpn_divrem_1 39.0
+ mpn_mod_1 21.5
+ mpn_divexact_by3 8.5
+
+ mpn_mul_1 5.5
+ mpn_addmul/submul_1 6.35
+
+ mpn_l/rshift 2.5
+
+ mpn_mul_basecase 8.2 cycles/crossproduct (approx)
+ mpn_sqr_basecase 4.0 cycles/crossproduct (approx)
+ or 7.75 cycles/triangleproduct (approx)
+
+Pentium II and III have MMX and get the following improvements.
+
+ mpn_divrem_1 25.0 integer part, 17.5 fractional part
+
+ mpn_l/rshift 1.75
+
+
+
+
+NOTES
+
+Write-allocate L1 data cache means prefetching of destinations is unnecessary.
+
+Mispredicted branches have a penalty of between 9 and 15 cycles, and even up
+to 26 cycles depending how far speculative execution has gone. The 9 cycle
+minimum penalty comes from the issue pipeline being 9 stages.
+
+A copy with rep movs seems to copy 16 bytes at a time, since speeds for 4,
+5, 6 or 7 limb operations are all the same. The 0.75 cycles/limb would be 3
+cycles per 16 byte block.
+
+
+
+
+CODING
+
+Instructions in general code have been shown grouped if they can execute
+together, which means up to three instructions with no successive
+dependencies, and with only the first being a multiple micro-op.
+
+P6 has out-of-order execution, so the groupings are really only showing
+dependent paths where some shuffling might allow some latencies to be
+hidden.
+
+
+
+
+REFERENCES
+
+"Intel Architecture Optimization Reference Manual", 1999, revision 001 dated
+02/99, order number 245127 (order number 730795-001 is in the document too).
+Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/245127.htm
+
+"Intel Architecture Optimization Manual", 1997, order number 242816. This
+is an older document mostly about P5 and not as good as the above.
+Available on-line:
+
+ http://download.intel.com/design/PentiumII/manuals/242816.htm
+
+
+
+----------------
+Local variables:
+mode: text
+fill-column: 76
+End:
diff --git a/gmp-6.3.0/mpn/x86/p6/aors_n.asm b/gmp-6.3.0/mpn/x86/p6/aors_n.asm
new file mode 100644
index 0000000..df51c2e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/aors_n.asm
@@ -0,0 +1,156 @@
+dnl Intel P6 mpn_add_n/mpn_sub_n -- mpn add or subtract.
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C * Avoid indexed addressing, it makes us stall on the two-ported register
+C file.
+
+C cycles/limb
+C P6 model 0-8,10-12 3.17
+C P6 model 9 (Banias) 2.15
+C P6 model 13 (Dothan) 2.25
+
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebx')
+define(`n', `%ecx')
+
+ifdef(`OPERATION_add_n', `
+ define(ADCSBB, adc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(func)
+ xor %edx, %edx
+L(start):
+ push %edi
+ push %esi
+ push %ebx
+
+ mov 16(%esp), rp
+ mov 20(%esp), up
+ mov 24(%esp), vp
+ mov 28(%esp), n
+
+ lea (up,n,4), up
+ lea (vp,n,4), vp
+ lea (rp,n,4), rp
+
+ neg n
+ mov n, %eax
+ and $-8, n
+ and $7, %eax
+ shl $2, %eax C 4x
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ lea L(ent) (%eax,%eax,2), %eax C 12x
+')
+
+ shr %edx C set cy flag
+ jmp *%eax
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ lea (%eax,%eax,2), %eax
+ add $L(ent)-L(here), %eax
+ add (%esp), %eax
+ ret_internal
+')
+
+L(end):
+ sbb %eax, %eax
+ neg %eax
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+
+ ALIGN(16)
+L(top):
+ jecxz L(end)
+L(ent):
+Zdisp( mov, 0,(up,n,4), %eax)
+Zdisp( ADCSBB, 0,(vp,n,4), %eax)
+Zdisp( mov, %eax, 0,(rp,n,4))
+
+ mov 4(up,n,4), %edx
+ ADCSBB 4(vp,n,4), %edx
+ mov %edx, 4(rp,n,4)
+
+ mov 8(up,n,4), %eax
+ ADCSBB 8(vp,n,4), %eax
+ mov %eax, 8(rp,n,4)
+
+ mov 12(up,n,4), %edx
+ ADCSBB 12(vp,n,4), %edx
+ mov %edx, 12(rp,n,4)
+
+ mov 16(up,n,4), %eax
+ ADCSBB 16(vp,n,4), %eax
+ mov %eax, 16(rp,n,4)
+
+ mov 20(up,n,4), %edx
+ ADCSBB 20(vp,n,4), %edx
+ mov %edx, 20(rp,n,4)
+
+ mov 24(up,n,4), %eax
+ ADCSBB 24(vp,n,4), %eax
+ mov %eax, 24(rp,n,4)
+
+ mov 28(up,n,4), %edx
+ ADCSBB 28(vp,n,4), %edx
+ mov %edx, 28(rp,n,4)
+
+ lea 8(n), n
+ jmp L(top)
+
+EPILOGUE()
+
+PROLOGUE(func_nc)
+ movl 20(%esp), %edx
+ jmp L(start)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm b/gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm
new file mode 100644
index 0000000..bc8c49c
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/aorsmul_1.asm
@@ -0,0 +1,320 @@
+dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+
+dnl Copyright 1999-2002, 2005 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P5
+C P6 model 0-8,10-12 6.44
+C P6 model 9 (Banias) 6.15
+C P6 model 13 (Dothan) 6.11
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C AMD K6
+C AMD K7
+C AMD K8
+
+
+dnl P6 UNROLL_COUNT cycles/limb
+dnl 8 6.7
+dnl 16 6.35
+dnl 32 6.3
+dnl 64 6.3
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1', `
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+ define(M4_description, add it to)
+ define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1', `
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+ define(M4_description, subtract it from)
+ define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+C
+C This code is pretty much the same as the K6 code. The unrolled loop is
+C the same, but there's just a few scheduling tweaks in the setups and the
+C simple loop.
+C
+C A number of variations have been tried for the unrolled loop, with one or
+C two carries, and with loads scheduled earlier, but nothing faster than 6
+C cycles/limb has been found.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(M4_function_1c)
+ pushl %ebx
+deflit(`FRAME',4)
+ movl PARAM_CARRY, %ebx
+ jmp L(start_nc)
+EPILOGUE()
+
+PROLOGUE(M4_function_1)
+ push %ebx
+deflit(`FRAME',4)
+ xorl %ebx, %ebx C initial carry
+
+L(start_nc):
+ movl PARAM_SIZE, %ecx
+ pushl %esi
+deflit(`FRAME',8)
+
+ movl PARAM_SRC, %esi
+ pushl %edi
+deflit(`FRAME',12)
+
+ movl PARAM_DST, %edi
+ pushl %ebp
+deflit(`FRAME',16)
+ cmpl $UNROLL_THRESHOLD, %ecx
+
+ movl PARAM_MULTIPLIER, %ebp
+ jae L(unroll)
+
+
+ C simple loop
+ C this is offset 0x22, so close enough to aligned
+L(simple):
+ C eax scratch
+ C ebx carry
+ C ecx counter
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ movl (%esi), %eax
+ addl $4, %edi
+
+ mull %ebp
+
+ addl %ebx, %eax
+ adcl $0, %edx
+
+ M4_inst %eax, -4(%edi)
+ movl %edx, %ebx
+
+ adcl $0, %ebx
+ decl %ecx
+
+ leal 4(%esi), %esi
+ jnz L(simple)
+
+
+ popl %ebp
+ popl %edi
+
+ popl %esi
+ movl %ebx, %eax
+
+ popl %ebx
+ ret
+
+
+
+C------------------------------------------------------------------------------
+C VAR_JUMP holds the computed jump temporarily because there's not enough
+C registers when doing the mul for the initial two carry limbs.
+C
+C The add/adc for the initial carry in %ebx is necessary only for the
+C mpn_add/submul_1c entry points. Duplicating the startup code to
+C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
+C idea.
+
+dnl overlapping with parameters already fetched
+define(VAR_COUNTER,`PARAM_SIZE')
+define(VAR_JUMP, `PARAM_DST')
+
+ C this is offset 0x43, so close enough to aligned
+L(unroll):
+ C eax
+ C ebx initial carry
+ C ecx size
+ C edx
+ C esi src
+ C edi dst
+ C ebp
+
+ movl %ecx, %edx
+ decl %ecx
+
+ subl $2, %edx
+ negl %ecx
+
+ shrl $UNROLL_LOG2, %edx
+ andl $UNROLL_MASK, %ecx
+
+ movl %edx, VAR_COUNTER
+ movl %ecx, %edx
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ shll $4, %edx
+ negl %ecx
+
+ leal L(entry) (%edx,%ecx,1), %edx
+')
+ movl (%esi), %eax C src low limb
+
+ movl %edx, VAR_JUMP
+ leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi
+
+ mull %ebp
+
+ addl %ebx, %eax C initial carry (from _1c)
+ adcl $0, %edx
+
+ movl %edx, %ebx C high carry
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi
+
+ movl VAR_JUMP, %edx
+ testl $1, %ecx
+ movl %eax, %ecx C low carry
+
+ cmovnz( %ebx, %ecx) C high,low carry other way around
+ cmovnz( %eax, %ebx)
+
+ jmp *%edx
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ shll $4, %edx
+ negl %ecx
+
+ C See mpn/x86/README about old gas bugs
+ leal (%edx,%ecx,1), %edx
+ addl $L(entry)-L(here), %edx
+
+ addl (%esp), %edx
+
+ ret_internal
+')
+
+
+C -----------------------------------------------------------
+ ALIGN(32)
+L(top):
+deflit(`FRAME',16)
+ C eax scratch
+ C ebx carry hi
+ C ecx carry lo
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+ C
+ C VAR_COUNTER loop counter
+ C
+ C 15 code bytes per limb
+
+ addl $UNROLL_BYTES, %edi
+
+L(entry):
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%esi), %eax)
+ mull %ebp
+Zdisp( M4_inst,%ecx, disp0,(%edi))
+ adcl %eax, %ebx
+ movl %edx, %ecx
+ adcl $0, %ecx
+
+ movl disp1(%esi), %eax
+ mull %ebp
+ M4_inst %ebx, disp1(%edi)
+ adcl %eax, %ecx
+ movl %edx, %ebx
+ adcl $0, %ebx
+')
+
+ decl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+
+ jns L(top)
+
+
+deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
+
+ M4_inst %ecx, disp0(%edi)
+ movl %ebx, %eax
+
+ popl %ebp
+ popl %edi
+
+ popl %esi
+ popl %ebx
+ adcl $0, %eax
+
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm
new file mode 100644
index 0000000..a0a9d90
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm
@@ -0,0 +1,287 @@
+dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato.
+
+dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C odd even divisor
+C P6: 10.0 12.0 cycles/limb
+
+C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+
+C The odd case is basically the same as mpn_modexact_1_odd, just with an
+C extra store, and it runs at the same 10 cycles which is the dependent
+C chain.
+C
+C The shifts for the even case aren't on the dependent chain so in principle
+C it could run the same too, but nothing running at 10 has been found.
+C Perhaps there's too many uops (an extra 4 over the odd case).
+
+defframe(PARAM_SHIFT, 24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(STACK_SPACE, 16)
+
+dnl re-use parameter space
+define(VAR_INVERSE,`PARAM_SRC')
+
+ TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse, int shift)
+
+ ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_INVERSE, %ebp
+
+ movl PARAM_SHIFT, %ecx C trailing twos
+
+L(common):
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ leal (%esi,%ebx,4), %esi C src end
+
+ leal (%edi,%ebx,4), %edi C dst end
+ negl %ebx C -size
+
+ movl (%esi,%ebx,4), %eax C src[0]
+
+ orl %ecx, %ecx
+ jz L(odd_entry)
+
+ movl %edi, PARAM_DST
+ movl %ebp, VAR_INVERSE
+
+L(even):
+ C eax src[0]
+ C ebx counter, limbs, negative
+ C ecx shift
+ C edx
+ C esi
+ C edi
+ C ebp
+
+ xorl %ebp, %ebp C initial carry bit
+ xorl %edx, %edx C initial carry limb (for size==1)
+
+ incl %ebx
+ jz L(even_one)
+
+ movl (%esi,%ebx,4), %edi C src[1]
+
+ shrdl( %cl, %edi, %eax)
+
+ jmp L(even_entry)
+
+
+L(even_top):
+ C eax scratch
+ C ebx counter, limbs, negative
+ C ecx shift
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size] and scratch
+ C ebp carry bit
+
+ movl (%esi,%ebx,4), %edi
+
+ mull PARAM_DIVISOR
+
+ movl -4(%esi,%ebx,4), %eax
+ shrdl( %cl, %edi, %eax)
+
+ subl %ebp, %eax
+
+ sbbl %ebp, %ebp
+ subl %edx, %eax
+
+ sbbl $0, %ebp
+
+L(even_entry):
+ imull VAR_INVERSE, %eax
+
+ movl PARAM_DST, %edi
+ negl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ incl %ebx
+ jnz L(even_top)
+
+ mull PARAM_DIVISOR
+
+ movl -4(%esi), %eax
+
+L(even_one):
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+
+ subl %ebp, %eax
+ movl SAVE_EBP, %ebp
+
+ subl %edx, %eax
+ movl SAVE_EBX, %ebx
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi)
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+C The dependent chain here is
+C
+C subl %edx, %eax 1
+C imull %ebp, %eax 4
+C mull PARAM_DIVISOR 5
+C ----
+C total 10
+C
+C and this is the measured speed. No special scheduling is necessary, out
+C of order execution hides the load latency.
+
+L(odd_top):
+ C eax scratch (src limb)
+ C ebx counter, limbs, negative
+ C ecx carry bit
+ C edx carry limb, high of last product
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp inverse
+
+ mull PARAM_DIVISOR
+
+ movl (%esi,%ebx,4), %eax
+ subl %ecx, %eax
+
+ sbbl %ecx, %ecx
+ subl %edx, %eax
+
+ sbbl $0, %ecx
+
+L(odd_entry):
+ imull %ebp, %eax
+
+ movl %eax, (%edi,%ebx,4)
+ negl %ecx
+
+ incl %ebx
+ jnz L(odd_top)
+
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+
+ ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ bsfl %eax, %ecx C trailing twos
+
+ movl %ebp, SAVE_EBP
+
+ shrl %cl, %eax C d without twos
+
+ movl %eax, %edx
+ shrl %eax C d/2 without twos
+
+ movl %edx, PARAM_DIVISOR
+ andl $127, %eax
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %ebp)
+ movzbl (%eax,%ebp), %ebp C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %ebp C inv 8 bits
+')
+
+ leal (%ebp,%ebp), %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+ imull %edx, %ebp C inv*inv*d
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ leal (%eax,%eax), %ebp C 2*inv
+
+ imull %eax, %eax C inv*inv
+ imull %edx, %eax C inv*inv*d
+
+ subl %eax, %ebp C inv = 2*inv - inv*inv*d
+
+ jmp L(common)
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86/p6/copyd.asm b/gmp-6.3.0/mpn/x86/p6/copyd.asm
new file mode 100644
index 0000000..1be7636
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/copyd.asm
@@ -0,0 +1,178 @@
+dnl Intel P6 mpn_copyd -- copy limb vector backwards.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 1.75 cycles/limb, or 0.75 if no overlap
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C An explicit loop is used because a decrementing rep movsl is a bit slow at
+C 2.4 c/l. That rep movsl also has about a 40 cycle startup time, and the
+C code here stands a chance of being faster if the branches predict well.
+C
+C The slightly strange loop form seems necessary for the claimed speed.
+C Maybe load/store ordering affects it.
+C
+C The source and destination are checked to see if they're actually
+C overlapping, since it might be possible to use an incrementing rep movsl
+C at 0.75 c/l. (It doesn't suffer the bad startup time of the decrementing
+C version.)
+C
+C Enhancements:
+C
+C Top speed for an all-integer copy is probably 1.0 c/l, being one load and
+C one store each cycle. Unrolling the loop below would approach 1.0, but
+C it'd be good to know why something like store/load/subl + store/load/jnz
+C doesn't already run at 1.0 c/l. It looks like it should decode in 2
+C cycles, but doesn't run that way.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-using parameter space
+define(SAVE_ESI,`PARAM_SIZE')
+define(SAVE_EDI,`PARAM_SRC')
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ subl $1, %ecx
+ jb L(zero)
+
+ movl (%esi,%ecx,4), %eax C src[size-1]
+ jz L(one)
+
+ movl -4(%esi,%ecx,4), %edx C src[size-2]
+ subl $2, %ecx
+ jbe L(done_loop) C 2 or 3 limbs only
+
+
+ C The usual overlap is
+ C
+ C high low
+ C +------------------+
+ C | dst|
+ C +------------------+
+ C +------------------+
+ C | src|
+ C +------------------+
+ C
+ C We can use an incrementing copy in the following circumstances.
+ C
+ C src+4*size<=dst, since then the regions are disjoint
+ C
+ C src==dst, clearly (though this shouldn't occur normally)
+ C
+ C src>dst, since in that case it's a requirement of the
+ C parameters that src>=dst+size*4, and hence the
+ C regions are disjoint
+ C
+
+ leal (%edi,%ecx,4), %edx
+ cmpl %edi, %esi
+ jae L(use_movsl) C src >= dst
+
+ cmpl %edi, %edx
+ movl 4(%esi,%ecx,4), %edx C src[size-2] again
+ jbe L(use_movsl) C src+4*size <= dst
+
+
+L(top):
+ C eax prev high limb
+ C ebx
+ C ecx counter, size-3 down to 0 or -1, inclusive, by 2s
+ C edx prev low limb
+ C esi src
+ C edi dst
+ C ebp
+
+ movl %eax, 8(%edi,%ecx,4)
+ movl (%esi,%ecx,4), %eax
+
+ movl %edx, 4(%edi,%ecx,4)
+ movl -4(%esi,%ecx,4), %edx
+
+ subl $2, %ecx
+ jnbe L(top)
+
+
+L(done_loop):
+ movl %eax, 8(%edi,%ecx,4)
+ movl %edx, 4(%edi,%ecx,4)
+
+ C copy low limb (needed if size was odd, but will already have been
+ C done in the loop if size was even)
+ movl (%esi), %eax
+L(one):
+ movl %eax, (%edi)
+ movl SAVE_EDI, %edi
+ movl SAVE_ESI, %esi
+
+ ret
+
+
+L(use_movsl):
+ C eax
+ C ebx
+ C ecx size-3
+ C edx
+ C esi src
+ C edi dst
+ C ebp
+
+ addl $3, %ecx
+
+ cld C better safe than sorry, see mpn/x86/README
+
+ rep
+ movsl
+
+L(zero):
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/p6/dive_1.asm b/gmp-6.3.0/mpn/x86/p6/dive_1.asm
new file mode 100644
index 0000000..7d61a18
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/dive_1.asm
@@ -0,0 +1,267 @@
+dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C odd even divisor
+C P6: 10.0 12.0 cycles/limb
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+C The odd case is basically the same as mpn_modexact_1_odd, just with an
+C extra store, and it runs at the same 10 cycles which is the dependent
+C chain.
+C
+C The shifts for the even case aren't on the dependent chain so in principle
+C it could run the same too, but nothing running at 10 has been found.
+C Perhaps there's too many uops (an extra 4 over the odd case).
+
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(VAR_INVERSE, -20)
+deflit(STACK_SPACE, 20)
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ bsfl %eax, %ecx C trailing twos
+
+ movl %ebp, SAVE_EBP
+
+ shrl %cl, %eax C d without twos
+
+ movl %eax, %edx
+ shrl %eax C d/2 without twos
+
+ movl %edx, PARAM_DIVISOR
+ andl $127, %eax
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %ebp)
+ movzbl (%eax,%ebp), %ebp C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %ebp C inv 8 bits
+')
+
+ leal (%ebp,%ebp), %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ leal (%esi,%ebx,4), %esi C src end
+
+ imull PARAM_DIVISOR, %ebp C inv*inv*d
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ leal (%eax,%eax), %ebp C 2*inv
+
+ imull %eax, %eax C inv*inv
+
+ leal (%edi,%ebx,4), %edi C dst end
+ negl %ebx C -size
+
+ movl %edi, PARAM_DST
+
+ imull PARAM_DIVISOR, %eax C inv*inv*d
+
+ subl %eax, %ebp C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ movl PARAM_DIVISOR, %eax
+ imull %ebp, %eax
+ cmpl $1, %eax')
+
+ movl %ebp, VAR_INVERSE
+ movl (%esi,%ebx,4), %eax C src[0]
+
+ orl %ecx, %ecx
+ jnz L(even)
+
+ C ecx initial carry is zero
+ jmp L(odd_entry)
+
+
+C The dependent chain here is
+C
+C subl %edx, %eax 1
+C imull %ebp, %eax 4
+C mull PARAM_DIVISOR 5
+C ----
+C total 10
+C
+C and this is the measured speed. No special scheduling is necessary, out
+C of order execution hides the load latency.
+
+L(odd_top):
+ C eax scratch (src limb)
+ C ebx counter, limbs, negative
+ C ecx carry bit
+ C edx carry limb, high of last product
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp
+
+ mull PARAM_DIVISOR
+
+ movl (%esi,%ebx,4), %eax
+ subl %ecx, %eax
+
+ sbbl %ecx, %ecx
+ subl %edx, %eax
+
+ sbbl $0, %ecx
+
+L(odd_entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, (%edi,%ebx,4)
+ negl %ecx
+
+ incl %ebx
+ jnz L(odd_top)
+
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+L(even):
+ C eax src[0]
+ C ebx counter, limbs, negative
+ C ecx shift
+ C edx
+ C esi
+ C edi
+ C ebp
+
+ xorl %ebp, %ebp C initial carry bit
+ xorl %edx, %edx C initial carry limb (for size==1)
+
+ incl %ebx
+ jz L(even_one)
+
+ movl (%esi,%ebx,4), %edi C src[1]
+
+ shrdl( %cl, %edi, %eax)
+
+ jmp L(even_entry)
+
+
+L(even_top):
+ C eax scratch
+ C ebx counter, limbs, negative
+ C ecx shift
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size] and scratch
+ C ebp carry bit
+
+ movl (%esi,%ebx,4), %edi
+
+ mull PARAM_DIVISOR
+
+ movl -4(%esi,%ebx,4), %eax
+ shrdl( %cl, %edi, %eax)
+
+ subl %ebp, %eax
+
+ sbbl %ebp, %ebp
+ subl %edx, %eax
+
+ sbbl $0, %ebp
+
+L(even_entry):
+ imull VAR_INVERSE, %eax
+
+ movl PARAM_DST, %edi
+ negl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ incl %ebx
+ jnz L(even_top)
+
+
+
+ mull PARAM_DIVISOR
+
+ movl -4(%esi), %eax
+
+L(even_one):
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+
+ subl %ebp, %eax
+ movl SAVE_EBP, %ebp
+
+ subl %edx, %eax
+ movl SAVE_EBX, %ebx
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi)
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86/p6/gcd_11.asm b/gmp-6.3.0/mpn/x86/p6/gcd_11.asm
new file mode 100644
index 0000000..80e055e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/gcd_11.asm
@@ -0,0 +1,83 @@
+dnl x86 mpn_gcd_11 optimised for processors with fast BSF.
+
+dnl Based on the K7 gcd_1.asm, by Kevin Ryde. Rehacked by Torbjorn Granlund.
+
+dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2015 Free Software
+dnl Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/bit (approx)
+C AMD K7 7.80
+C AMD K8,K9 7.79
+C AMD K10 4.08
+C AMD bd1 ?
+C AMD bobcat 7.82
+C Intel P4-2 14.9
+C Intel P4-3/4 14.0
+C Intel P6/13 5.09
+C Intel core2 4.22
+C Intel NHM 5.00
+C Intel SBR 5.00
+C Intel atom 17.1
+C VIA nano ?
+C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
+
+
+define(`u0', `%eax')
+define(`v0', `%edx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+ push %edi
+ push %esi
+
+ mov 12(%esp), %eax
+ mov 16(%esp), %edx
+ jmp L(odd)
+
+ ALIGN(16) C K10 BD C2 NHM SBR
+L(top): cmovc( %esi, %eax) C u = |v - u| 0,3 0,3 0,6 0,5 0,5
+ cmovc( %edi, %edx) C v = min(u,v) 0,3 0,3 2,8 1,7 1,7
+ shr %cl, %eax C 1,7 1,6 2,8 2,8 2,8
+L(odd): mov %edx, %esi C 1 1 4 3 3
+ sub %eax, %esi C 2 2 5 4 4
+ bsf %esi, %ecx C 3 3 6 5 5
+ mov %eax, %edi C 2 2 3 3 4
+ sub %edx, %eax C 2 2 4 3 4
+ jnz L(top) C
+
+L(end): mov %edx, %eax
+ pop %esi
+ pop %edi
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/p6/gmp-mparam.h b/gmp-6.3.0/mpn/x86/p6/gmp-mparam.h
new file mode 100644
index 0000000..96c96fd
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/gmp-mparam.h
@@ -0,0 +1,194 @@
+/* Intel P6 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2008-2010, 2012 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
+ value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in
+ mpn/x86/p6/sqr_basecase.asm. */
+
+
+/* 1867 MHz P6 model 13 */
+
+#define MOD_1_NORM_THRESHOLD 4
+#define MOD_1_UNNORM_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 21
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 74
+#define MUL_TOOM44_THRESHOLD 181
+#define MUL_TOOM6H_THRESHOLD 252
+#define MUL_TOOM8H_THRESHOLD 363
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 101
+#define SQR_TOOM4_THRESHOLD 154
+#define SQR_TOOM6_THRESHOLD 222
+#define SQR_TOOM8_THRESHOLD 527
+
+#define MULMID_TOOM42_THRESHOLD 58
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define POWM_SEC_TABLE 4,23,258,768,2388
+
+#define MUL_FFT_MODF_THRESHOLD 565 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 31, 7}, { 63, 8}, { 39, 9}, { 23, 5}, \
+ { 383, 4}, { 991, 5}, { 511, 6}, { 267, 7}, \
+ { 157, 8}, { 91, 9}, { 47, 8}, { 111, 9}, \
+ { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \
+ { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
+ { 159,10}, { 335, 9}, { 671,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \
+ { 415,11}, { 223,12}, { 127,11}, { 255,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 607,11}, \
+ { 319,10}, { 671,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,10}, { 831,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \
+ { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \
+ { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \
+ { 1599,11}, { 863,12}, { 447,11}, { 959,13}, \
+ { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
+ { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \
+ { 1471,13}, { 383,12}, { 831,11}, { 1727,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
+ { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
+ { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \
+ { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \
+ { 1407,12}, { 2815,14}, { 767,13}, { 1663,12}, \
+ { 3455,13}, { 8192,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 132
+#define MUL_FFT_THRESHOLD 6784
+
+#define SQR_FFT_MODF_THRESHOLD 472 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 472, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 63, 4}, { 1023, 8}, { 67, 9}, \
+ { 39, 5}, { 639, 4}, { 1471, 6}, { 383, 7}, \
+ { 209, 8}, { 119, 9}, { 63, 7}, { 255, 8}, \
+ { 139, 9}, { 71, 8}, { 143, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159, 8}, { 319, 9}, \
+ { 167,10}, { 95,11}, { 63,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \
+ { 1087,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \
+ { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \
+ { 831,11}, { 223,12}, { 127,11}, { 255,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \
+ { 1215,11}, { 319,10}, { 671, 9}, { 1343,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,10}, { 831,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \
+ { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \
+ { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
+ { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \
+ { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \
+ { 2111,13}, { 1151,12}, { 2431,13}, { 1407,14}, \
+ { 767,13}, { 1663,12}, { 3455,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 146
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 33
+#define MULLO_MUL_N_THRESHOLD 13463
+
+#define DC_DIV_QR_THRESHOLD 20
+#define DC_DIVAPPR_Q_THRESHOLD 56
+#define DC_BDIV_QR_THRESHOLD 60
+#define DC_BDIV_Q_THRESHOLD 134
+
+#define INV_MULMOD_BNM1_THRESHOLD 38
+#define INV_NEWTON_THRESHOLD 66
+#define INV_APPR_THRESHOLD 63
+
+#define BINV_NEWTON_THRESHOLD 250
+#define REDC_1_TO_REDC_N_THRESHOLD 63
+
+#define MU_DIV_QR_THRESHOLD 1164
+#define MU_DIVAPPR_Q_THRESHOLD 979
+#define MUPI_DIV_QR_THRESHOLD 38
+#define MU_BDIV_QR_THRESHOLD 1442
+#define MU_BDIV_Q_THRESHOLD 1470
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD_THRESHOLD 64
+#define HGCD_APPR_THRESHOLD 105
+#define HGCD_REDUCE_THRESHOLD 3524
+#define GCD_DC_THRESHOLD 386
+#define GCDEXT_DC_THRESHOLD 309
+#define JACOBI_BASE_METHOD 1
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 26
+#define SET_STR_DC_THRESHOLD 587
+#define SET_STR_PRECOMPUTE_THRESHOLD 1104
diff --git a/gmp-6.3.0/mpn/x86/p6/lshsub_n.asm b/gmp-6.3.0/mpn/x86/p6/lshsub_n.asm
new file mode 100644
index 0000000..7ada213
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/lshsub_n.asm
@@ -0,0 +1,169 @@
+dnl Intel P6 mpn_lshsub_n -- mpn papillion support.
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C P6/13: 3.35 cycles/limb (separate mpn_sub_n + mpn_lshift needs 4.12)
+
+C (1) The loop is not scheduled in any way, and scheduling attempts have not
+C improved speed on P6/13. Presumably, the K7 will want scheduling, if it
+C at all wants to use MMX.
+C (2) We could save a register by not alternatingly using eax and edx in the
+C loop.
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebx')
+define(`n', `%ecx')
+define(`cnt', `%mm7')
+
+ASM_START()
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_lshsub_n)
+ push %edi
+ push %esi
+ push %ebx
+
+ mov 16(%esp), rp
+ mov 20(%esp), up
+ mov 24(%esp), vp
+ mov 28(%esp), n
+ mov $32, %eax
+ sub 32(%esp), %eax
+ movd %eax, cnt
+
+ lea (up,n,4), up
+ lea (vp,n,4), vp
+ lea (rp,n,4), rp
+
+ neg n
+ mov n, %eax
+ and $-8, n
+ and $7, %eax
+ shl %eax C eax = 2x
+ lea (%eax,%eax,4), %edx C edx = 10x
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ lea L(ent)(%eax,%edx,2), %eax C eax = 22x
+')
+
+ pxor %mm1, %mm1
+ pxor %mm0, %mm0
+
+ jmp *%eax
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ lea (%eax,%edx,2), %eax
+ add $L(ent)-L(here), %eax
+ add (%esp), %eax
+ ret_internal
+')
+
+L(end): C compute (cy<<cnt) | (edx>>(32-cnt))
+ sbb %eax, %eax
+ neg %eax
+ mov 32(%esp), %ecx
+ shld %cl, %edx, %eax
+
+ emms
+
+ pop %ebx
+ pop %esi
+ pop %edi
+ ret
+ ALIGN(16)
+L(top): jecxz L(end)
+L(ent): mov 0(up,n,4), %eax
+ sbb 0(vp,n,4), %eax
+ movd %eax, %mm0
+ punpckldq %mm0, %mm1
+ psrlq %mm7, %mm1
+ movd %mm1, 0(rp,n,4)
+
+ mov 4(up,n,4), %edx
+ sbb 4(vp,n,4), %edx
+ movd %edx, %mm1
+ punpckldq %mm1, %mm0
+ psrlq %mm7, %mm0
+ movd %mm0, 4(rp,n,4)
+
+ mov 8(up,n,4), %eax
+ sbb 8(vp,n,4), %eax
+ movd %eax, %mm0
+ punpckldq %mm0, %mm1
+ psrlq %mm7, %mm1
+ movd %mm1, 8(rp,n,4)
+
+ mov 12(up,n,4), %edx
+ sbb 12(vp,n,4), %edx
+ movd %edx, %mm1
+ punpckldq %mm1, %mm0
+ psrlq %mm7, %mm0
+ movd %mm0, 12(rp,n,4)
+
+ mov 16(up,n,4), %eax
+ sbb 16(vp,n,4), %eax
+ movd %eax, %mm0
+ punpckldq %mm0, %mm1
+ psrlq %mm7, %mm1
+ movd %mm1, 16(rp,n,4)
+
+ mov 20(up,n,4), %edx
+ sbb 20(vp,n,4), %edx
+ movd %edx, %mm1
+ punpckldq %mm1, %mm0
+ psrlq %mm7, %mm0
+ movd %mm0, 20(rp,n,4)
+
+ mov 24(up,n,4), %eax
+ sbb 24(vp,n,4), %eax
+ movd %eax, %mm0
+ punpckldq %mm0, %mm1
+ psrlq %mm7, %mm1
+ movd %mm1, 24(rp,n,4)
+
+ mov 28(up,n,4), %edx
+ sbb 28(vp,n,4), %edx
+ movd %edx, %mm1
+ punpckldq %mm1, %mm0
+ psrlq %mm7, %mm0
+ movd %mm0, 28(rp,n,4)
+
+ lea 8(n), n
+ jmp L(top)
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm b/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm
new file mode 100644
index 0000000..5300616
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/mmx/divrem_1.asm
@@ -0,0 +1,767 @@
+dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t inverse,
+C unsigned shift);
+C
+C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
+C see that file for some comments. It's possible what's here can be improved.
+
+
+dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
+dnl
+dnl The different speeds of the integer and fraction parts means that using
+dnl xsize+size isn't quite right. The threshold wants to be a bit higher
+dnl for the integer part and a bit lower for the fraction part. (Or what's
+dnl really wanted is to speed up the integer part!)
+dnl
+dnl The threshold is set to make the integer part right. At 4 limbs the
+dnl div and mul are about the same there, but on the fractional part the
+dnl mul is much faster.
+
+deflit(MUL_THRESHOLD, 4)
+
+
+defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
+defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
+defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+
+defframe(VAR_NORM, -20)
+defframe(VAR_INVERSE, -24)
+defframe(VAR_SRC, -28)
+defframe(VAR_DST, -32)
+defframe(VAR_DST_STOP,-36)
+
+deflit(STACK_SPACE, 36)
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_preinv_divrem_1)
+deflit(`FRAME',0)
+ movl PARAM_XSIZE, %ecx
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edx
+
+ movl -4(%esi,%ebx,4), %eax C src high limb
+ xorl %edi, %edi C initial carry (if can't skip a div)
+
+ C
+
+ leal 8(%edx,%ecx,4), %edx C &dst[xsize+2]
+ xor %ecx, %ecx
+
+ movl %edx, VAR_DST_STOP C &dst[xsize+2]
+ cmpl %ebp, %eax C high cmp divisor
+
+ cmovc( %eax, %edi) C high is carry if high<divisor
+
+ cmovnc( %eax, %ecx) C 0 if skip div, src high if not
+ C (the latter in case src==dst)
+
+ movl %ecx, -12(%edx,%ebx,4) C dst high limb
+
+ sbbl $0, %ebx C skip one division if high<divisor
+ movl PARAM_PREINV_SHIFT, %ecx
+
+ leal -8(%edx,%ebx,4), %edx C &dst[xsize+size]
+ movl $32, %eax
+
+ movl %edx, VAR_DST C &dst[xsize+size]
+
+ shll %cl, %ebp C d normalized
+ subl %ecx, %eax
+ movl %ecx, VAR_NORM
+
+ movd %eax, %mm7 C rshift
+ movl PARAM_PREINV_INVERSE, %eax
+ jmp L(start_preinv)
+
+EPILOGUE()
+
+
+
+ ALIGN(16)
+
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+ movl PARAM_CARRY, %edx
+
+ movl PARAM_SIZE, %ecx
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ leal -4(%edi,%ebx,4), %edi
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ C offset 0x31, close enough to aligned
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl $0, %edx C initial carry (if can't skip a div)
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+ orl %ecx, %ecx C size
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+ jz L(no_skip_div) C if size==0
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+ xorl %esi, %esi
+ cmpl %ebp, %eax C high cmp divisor
+
+ cmovc( %eax, %edx) C high is carry if high<divisor
+
+ cmovnc( %eax, %esi) C 0 if skip div, src high if not
+ C (the latter in case src==dst)
+
+ movl %esi, (%edi,%ecx,4) C dst high limb
+
+ sbbl $0, %ecx C size-1 if high<divisor
+ movl PARAM_SRC, %esi C reload
+L(no_skip_div):
+
+
+L(start_1c):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal (%ebx,%ecx), %eax C size+xsize
+ cmpl $MUL_THRESHOLD, %eax
+ jae L(mul_by_inverse)
+
+ orl %ecx, %ecx
+ jz L(divide_no_integer)
+
+L(divide_integer):
+ C eax scratch (quotient)
+ C ebx xsize
+ C ecx counter
+ C edx scratch (remainder)
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ movl -4(%esi,%ecx,4), %eax
+
+ divl %ebp
+
+ movl %eax, (%edi,%ecx,4)
+ decl %ecx
+ jnz L(divide_integer)
+
+
+L(divide_no_integer):
+ movl PARAM_DST, %edi
+ orl %ebx, %ebx
+ jnz L(divide_fraction)
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBX, %ebx
+ movl %edx, %eax
+
+ movl SAVE_EBP, %ebp
+ addl $STACK_SPACE, %esp
+
+ ret
+
+
+L(divide_fraction):
+ C eax scratch (quotient)
+ C ebx counter
+ C ecx
+ C edx scratch (remainder)
+ C esi
+ C edi dst
+ C ebp divisor
+
+ movl $0, %eax
+
+ divl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ decl %ebx
+ jnz L(divide_fraction)
+
+ jmp L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal 12(%edi), %ebx C &dst[xsize+2], loop dst stop
+
+ movl %ebx, VAR_DST_STOP
+ leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
+
+ movl %edi, VAR_DST
+ movl %ecx, %ebx C size
+
+ bsrl %ebp, %ecx C 31-l
+ movl %edx, %edi C carry
+
+ leal 1(%ecx), %eax C 32-l
+ xorl $31, %ecx C l
+
+ movl %ecx, VAR_NORM
+ movl $-1, %edx
+
+ shll %cl, %ebp C d normalized
+ movd %eax, %mm7
+
+ movl $-1, %eax
+ subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1) / d
+
+L(start_preinv):
+ C eax inverse
+ C ebx size
+ C ecx shift
+ C edx
+ C esi src
+ C edi carry
+ C ebp divisor
+ C
+ C mm7 rshift
+
+ movl %eax, VAR_INVERSE
+ orl %ebx, %ebx C size
+ leal -12(%esi,%ebx,4), %eax C &src[size-3]
+
+ movl %eax, VAR_SRC
+ jz L(start_zero)
+
+ movl 8(%eax), %esi C src high limb
+ cmpl $1, %ebx
+ jz L(start_one)
+
+L(start_two_or_more):
+ movl 4(%eax), %edx C src second highest limb
+
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shldl( %cl, %edx, %esi) C n10 = high,second << l
+
+ cmpl $2, %ebx
+ je L(integer_two_left)
+ jmp L(integer_top)
+
+
+L(start_one):
+ shldl( %cl, %esi, %edi) C n2 = carry,high << l
+
+ shll %cl, %esi C n10 = high << l
+ jmp L(integer_one_left)
+
+
+L(start_zero):
+ C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
+ C skipped a division.
+
+ shll %cl, %edi C n2 = carry << l
+ movl %edi, %eax C return value for zero_done
+ cmpl $0, PARAM_XSIZE
+
+ je L(zero_done)
+ jmp L(fraction_some)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C This loop runs at about 25 cycles, which is probably sub-optimal, and
+C certainly more than the dependent chain would suggest. A better loop, or
+C a better rough analysis of what's possible, would be welcomed.
+C
+C In the current implementation, the following successively dependent
+C micro-ops seem to exist.
+C
+C uops
+C n2+n1 1 (addl)
+C mul 5
+C q1+1 3 (addl/adcl)
+C mul 5
+C sub 3 (subl/sbbl)
+C addback 2 (cmov)
+C ---
+C 19
+C
+C Lack of registers hinders explicit scheduling and it might be that the
+C normal out of order execution isn't able to hide enough under the mul
+C latencies.
+C
+C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
+C cmov (and takes one uop off the dependent chain). A sarl/andl/addl
+C combination was tried for the addback (despite the fact it would lengthen
+C the dependent chain) but found to be no faster.
+
+
+ ALIGN(16)
+L(integer_top):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp d
+ C
+ C mm0 scratch (src qword)
+ C mm7 rshift for normalization
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl VAR_SRC, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+ movq (%ecx), %mm0 C next src limb and the one below it
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ subl $4, %ecx
+
+ movl %ecx, VAR_SRC
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ movl %ebp, %eax C d
+ leal 1(%edi), %ebx C n2+1
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+ jz L(q1_ff)
+
+ mull %ebx C (q1+1)*d
+
+ movl VAR_DST, %ecx
+ psrlq %mm7, %mm0
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+ movl VAR_DST_STOP, %eax
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+
+ sbbl $0, %ebx C q
+ subl $4, %ecx
+
+ movl %ebx, (%ecx)
+ cmpl %eax, %ecx
+
+ movl %ecx, VAR_DST
+ jne L(integer_top)
+
+
+L(integer_loop_done):
+
+
+C -----------------------------------------------------------------------------
+C
+C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
+C q1_ff special case. This make the code a bit smaller and simpler, and
+C costs only 2 cycles (each).
+
+L(integer_two_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (src, dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm7 rshift
+
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl PARAM_SRC, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ movd (%ecx), %mm0 C src low limb
+
+ movl VAR_DST_STOP, %ecx
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx
+
+ mull %ebx C (q1+1)*d
+
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ C
+
+ C
+
+ subl %eax, %esi
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+ movd %mm0, %esi
+
+ sbbl $0, %ebx C q
+
+ movl %ebx, -4(%ecx)
+
+
+C -----------------------------------------------------------------------------
+L(integer_one_left):
+ C eax scratch
+ C ebx scratch (nadj, q1)
+ C ecx scratch (dst)
+ C edx scratch
+ C esi n10
+ C edi n2
+ C ebp divisor
+ C
+ C mm7 rshift
+
+
+ movl %esi, %eax
+ movl %ebp, %ebx
+
+ sarl $31, %eax C -n1
+ movl VAR_DST_STOP, %ecx
+
+ andl %eax, %ebx C -n1 & d
+ negl %eax C n1
+
+ addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
+ addl %edi, %eax C n2+n1
+
+ mull VAR_INVERSE C m*(n2+n1)
+
+ C
+
+ C
+
+ C
+
+ addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
+ leal 1(%edi), %ebx C n2+1
+ movl %ebp, %eax C d
+
+ C
+
+ adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+
+ sbbl $0, %ebx C q1 if q1+1 overflowed
+
+ mull %ebx
+
+ C
+
+ C
+
+ C
+
+ C
+
+ subl %eax, %esi
+ movl PARAM_XSIZE, %eax
+
+ sbbl %edx, %edi C n - (q1+1)*d
+ movl %esi, %edi C remainder -> n2
+ leal (%ebp,%esi), %edx
+
+ cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
+
+ sbbl $0, %ebx C q
+
+ movl %ebx, -8(%ecx)
+ subl $8, %ecx
+
+
+
+ orl %eax, %eax C xsize
+ jnz L(fraction_some)
+
+ movl %edi, %eax
+L(fraction_done):
+ movl VAR_NORM, %ecx
+L(zero_done):
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ shrl %cl, %eax
+ emms
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d
+
+L(q1_ff):
+ C eax (divisor)
+ C ebx (q1+1 == 0)
+ C ecx
+ C edx
+ C esi n10
+ C edi n2
+ C ebp divisor
+
+ movl VAR_DST, %ecx
+ movl VAR_DST_STOP, %edx
+ subl $4, %ecx
+
+ movl %ecx, VAR_DST
+ psrlq %mm7, %mm0
+ leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
+
+ movl $-1, (%ecx)
+ movd %mm0, %esi C next n10
+
+ cmpl %ecx, %edx
+ jne L(integer_top)
+
+ jmp L(integer_loop_done)
+
+
+
+C -----------------------------------------------------------------------------
+C
+C In the current implementation, the following successively dependent
+C micro-ops seem to exist.
+C
+C uops
+C mul 5
+C q1+1 1 (addl)
+C mul 5
+C sub 3 (negl/sbbl)
+C addback 2 (cmov)
+C ---
+C 16
+C
+C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for
+C the addback was found to be a touch slower.
+
+
+ ALIGN(16)
+L(fraction_some):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi
+ C edi carry
+ C ebp divisor
+
+ movl PARAM_DST, %esi
+ movl VAR_DST_STOP, %ecx C &dst[xsize+2]
+ movl %edi, %eax
+
+ subl $8, %ecx C &dst[xsize]
+
+
+ ALIGN(16)
+L(fraction_top):
+ C eax n2, then scratch
+ C ebx scratch (nadj, q1)
+ C ecx dst, decrementing
+ C edx scratch
+ C esi dst stop point
+ C edi n2
+ C ebp divisor
+
+ mull VAR_INVERSE C m*n2
+
+ movl %ebp, %eax C d
+ subl $4, %ecx C dst
+ leal 1(%edi), %ebx
+
+ C
+
+ C
+
+ C
+
+ addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
+
+ mull %ebx C (q1+1)*d
+
+ C
+
+ C
+
+ C
+
+ C
+
+ negl %eax C low of n - (q1+1)*d
+
+ sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
+ leal (%ebp,%eax), %edx
+
+ cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
+
+ sbbl $0, %ebx C q
+ movl %eax, %edi C remainder->n2
+ cmpl %esi, %ecx
+
+ movl %ebx, (%ecx) C previous q
+ jne L(fraction_top)
+
+
+ jmp L(fraction_done)
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h b/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h
new file mode 100644
index 0000000..ef29061
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/mmx/gmp-mparam.h
@@ -0,0 +1,218 @@
+/* Intel P6/mmx gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991-2017 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
+ value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in
+ mpn/x86/p6/sqr_basecase.asm. */
+
+
+/* 800 MHz P6 model 8 */
+/* Generated by tuneup.c, 2017-02-03, gcc 4.8 */
+
+#define MOD_1_1P_METHOD 2
+#define MOD_1_NORM_THRESHOLD 3
+#define MOD_1_UNNORM_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 30
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 14
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 1
+#define DIV_QR_1_NORM_THRESHOLD 4
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 62
+
+#define DIV_1_VS_MUL_1_PERCENT 168
+
+#define MUL_TOOM22_THRESHOLD 22
+#define MUL_TOOM33_THRESHOLD 73
+#define MUL_TOOM44_THRESHOLD 195
+#define MUL_TOOM6H_THRESHOLD 254
+#define MUL_TOOM8H_THRESHOLD 381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 81
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 122
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 73
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 100
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 30 /* WRONG value, see comment above */
+#define SQR_TOOM3_THRESHOLD 83
+#define SQR_TOOM4_THRESHOLD 196
+#define SQR_TOOM6_THRESHOLD 214
+#define SQR_TOOM8_THRESHOLD 381
+
+#define MULMID_TOOM42_THRESHOLD 56
+
+#define MULMOD_BNM1_THRESHOLD 16
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 476, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 25, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
+ { 11, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \
+ { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \
+ { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 167,10}, { 95, 9}, { 199,10}, \
+ { 111,11}, { 63,10}, { 127, 9}, { 255, 8}, \
+ { 511,10}, { 143, 9}, { 287, 8}, { 575,10}, \
+ { 159,11}, { 95,10}, { 191, 9}, { 383,10}, \
+ { 207,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543, 8}, { 1087,10}, \
+ { 287, 9}, { 575,11}, { 159,10}, { 319, 9}, \
+ { 639,10}, { 351, 9}, { 703,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 415, 9}, { 831,11}, \
+ { 223,10}, { 447,12}, { 127,11}, { 255,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \
+ { 1215,11}, { 319,10}, { 671,11}, { 351,10}, \
+ { 703,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,11}, { 447,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \
+ { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \
+ { 703,10}, { 1407,11}, { 735,12}, { 383,11}, \
+ { 831,12}, { 447,11}, { 959,10}, { 1919,13}, \
+ { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
+ { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \
+ { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \
+ { 1535,12}, { 831,11}, { 1727,12}, { 959,11}, \
+ { 1919,14}, { 255,13}, { 511,12}, { 1215,11}, \
+ { 2431,13}, { 639,12}, { 1471,11}, { 2943,13}, \
+ { 767,12}, { 1727,13}, { 895,12}, { 1919,11}, \
+ { 3839,14}, { 511,13}, { 1023,12}, { 2111,13}, \
+ { 1151,12}, { 2431,13}, { 1279,12}, { 2559,13}, \
+ { 1407,12}, { 2943,14}, { 767,13}, { 1663,12}, \
+ { 3327,13}, { 1919,12}, { 3839,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 160
+#define MUL_FFT_THRESHOLD 7040
+
+#define SQR_FFT_MODF_THRESHOLD 376 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 376, 5}, { 21, 6}, { 11, 5}, { 23, 6}, \
+ { 21, 7}, { 11, 6}, { 24, 7}, { 13, 6}, \
+ { 27, 7}, { 15, 6}, { 31, 7}, { 21, 8}, \
+ { 11, 7}, { 27, 8}, { 15, 7}, { 33, 8}, \
+ { 19, 7}, { 39, 8}, { 23, 7}, { 47, 8}, \
+ { 27, 9}, { 15, 8}, { 39, 9}, { 23, 8}, \
+ { 51,10}, { 15, 9}, { 31, 8}, { 67, 9}, \
+ { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \
+ { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 127, 8}, \
+ { 255, 9}, { 135,10}, { 79, 9}, { 167,10}, \
+ { 95, 9}, { 191, 8}, { 383,10}, { 111,11}, \
+ { 63,10}, { 127, 9}, { 255, 8}, { 511, 9}, \
+ { 271,10}, { 143, 9}, { 287, 8}, { 575, 9}, \
+ { 303, 8}, { 607,10}, { 159, 9}, { 319,11}, \
+ { 95,10}, { 191, 9}, { 383,10}, { 207,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \
+ { 271, 9}, { 543,10}, { 287, 9}, { 575,10}, \
+ { 303,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 415, 9}, { 831,11}, { 223,10}, \
+ { 479,12}, { 127,11}, { 255,10}, { 543, 9}, \
+ { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \
+ { 319,10}, { 671,11}, { 351,10}, { 703,12}, \
+ { 191,11}, { 383,10}, { 767,11}, { 415,10}, \
+ { 831,11}, { 479,13}, { 127,12}, { 255,11}, \
+ { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 703,10}, \
+ { 1407,11}, { 735,12}, { 383,11}, { 831,12}, \
+ { 447,11}, { 959,10}, { 1919,13}, { 255,12}, \
+ { 511,11}, { 1087,12}, { 575,11}, { 1215,10}, \
+ { 2431,12}, { 639,11}, { 1343,12}, { 703,11}, \
+ { 1407,13}, { 383,12}, { 831,11}, { 1727,12}, \
+ { 959,11}, { 1919,14}, { 255,13}, { 511,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \
+ { 2943,13}, { 767,12}, { 1727,13}, { 895,12}, \
+ { 1919,11}, { 3839,14}, { 511,13}, { 1023,12}, \
+ { 2111,13}, { 1151,12}, { 2431,13}, { 1407,12}, \
+ { 2943,14}, { 767,13}, { 1535,12}, { 3071,13}, \
+ { 1663,12}, { 3455,13}, { 1919,12}, { 3839,15}, \
+ { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 161
+#define SQR_FFT_THRESHOLD 3712
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 62
+#define MULLO_MUL_N_THRESHOLD 13463
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 177
+#define SQRLO_SQR_THRESHOLD 8937
+
+#define DC_DIV_QR_THRESHOLD 80
+#define DC_DIVAPPR_Q_THRESHOLD 240
+#define DC_BDIV_QR_THRESHOLD 76
+#define DC_BDIV_Q_THRESHOLD 166
+
+#define INV_MULMOD_BNM1_THRESHOLD 42
+#define INV_NEWTON_THRESHOLD 262
+#define INV_APPR_THRESHOLD 250
+
+#define BINV_NEWTON_THRESHOLD 272
+#define REDC_1_TO_REDC_N_THRESHOLD 72
+
+#define MU_DIV_QR_THRESHOLD 1499
+#define MU_DIVAPPR_Q_THRESHOLD 1470
+#define MUPI_DIV_QR_THRESHOLD 124
+#define MU_BDIV_QR_THRESHOLD 1142
+#define MU_BDIV_Q_THRESHOLD 1341
+
+#define POWM_SEC_TABLE 1,16,96,416,1259
+
+#define GET_STR_DC_THRESHOLD 14
+#define GET_STR_PRECOMPUTE_THRESHOLD 27
+#define SET_STR_DC_THRESHOLD 270
+#define SET_STR_PRECOMPUTE_THRESHOLD 1084
+
+#define FAC_DSC_THRESHOLD 194
+#define FAC_ODD_THRESHOLD 25
+
+#define MATRIX22_STRASSEN_THRESHOLD 16
+#define HGCD_THRESHOLD 124
+#define HGCD_APPR_THRESHOLD 152
+#define HGCD_REDUCE_THRESHOLD 3014
+#define GCD_DC_THRESHOLD 474
+#define GCDEXT_DC_THRESHOLD 321
+#define JACOBI_BASE_METHOD 1
diff --git a/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm b/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm
new file mode 100644
index 0000000..febd1c0
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/mmx/lshift.asm
@@ -0,0 +1,38 @@
+dnl Intel Pentium-II mpn_lshift -- mpn left shift.
+
+dnl Copyright 2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl The P55 code runs well on P-II/III, but could stand some minor tweaks
+dnl at some stage probably.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_lshift)
+include_mpn(`x86/pentium/mmx/lshift.asm')
diff --git a/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm b/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm
new file mode 100644
index 0000000..fd340e4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/mmx/popham.asm
@@ -0,0 +1,39 @@
+dnl Intel Pentium-II mpn_popcount, mpn_hamdist -- population count and
+dnl hamming distance.
+
+dnl Copyright 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6MMX: popcount 11 cycles/limb (approx), hamdist 11.5 cycles/limb (approx)
+
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k6/mmx/popham.asm')
diff --git a/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm b/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm
new file mode 100644
index 0000000..77aa190
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/mmx/rshift.asm
@@ -0,0 +1,38 @@
+dnl Intel Pentium-II mpn_rshift -- mpn left shift.
+
+dnl Copyright 2001 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+dnl The P55 code runs well on P-II/III, but could stand some minor tweaks
+dnl at some stage probably.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_rshift)
+include_mpn(`x86/pentium/mmx/rshift.asm')
diff --git a/gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm
new file mode 100644
index 0000000..b88ab5d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/mod_34lsub1.asm
@@ -0,0 +1,190 @@
+dnl Intel P6 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl Copyright 2000-2002, 2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 2.0 cycles/limb
+
+C TODO
+C Experiments with more unrolling indicate that 1.5 c/l is possible on P6-13
+C with the current carry handling scheme.
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C Groups of three limbs are handled, with carry bits from 0mod3 into 1mod3
+C into 2mod3, but at that point going into a separate carries total so we
+C don't keep the carry flag live across the loop control. Avoiding decl
+C lets us get to 2.0 c/l, as compared to the generic x86 code at 3.66.
+C
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX, `PARAM_SIZE')
+define(SAVE_ESI, `PARAM_SRC')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %edx
+
+ subl $2, %ecx C size-2
+ movl (%edx), %eax C src[0]
+ ja L(three_or_more)
+ jb L(one)
+
+ C size==2
+
+ movl 4(%edx), %ecx C src[1]
+
+ movl %eax, %edx C src[0]
+ shrl $24, %eax C src[0] high
+
+ andl $0xFFFFFF, %edx C src[0] low
+
+ addl %edx, %eax
+ movl %ecx, %edx C src[1]
+ shrl $16, %ecx C src[1] high
+
+ andl $0xFFFF, %edx
+ addl %ecx, %eax
+
+ shll $8, %edx C src[1] low
+
+ addl %edx, %eax
+L(one):
+ ret
+
+
+L(three_or_more):
+ C eax src[0], initial acc 0mod3
+ C ebx
+ C ecx size-2
+ C edx src
+ C esi
+ C edi
+ C ebp
+
+ movl %ebx, SAVE_EBX
+ movl 4(%edx), %ebx C src[1], initial 1mod3
+ subl $3, %ecx C size-5
+
+ movl %esi, SAVE_ESI
+ movl 8(%edx), %esi C src[2], initial 2mod3
+
+ pushl %edi FRAME_pushl()
+ movl $0, %edi C initial carries 0mod3
+ jng L(done) C if size < 6
+
+
+L(top):
+ C eax acc 0mod3
+ C ebx acc 1mod3
+ C ecx counter, limbs
+ C edx src
+ C esi acc 2mod3
+ C edi carrys into 0mod3
+ C ebp
+
+ addl 12(%edx), %eax
+ adcl 16(%edx), %ebx
+ adcl 20(%edx), %esi
+ leal 12(%edx), %edx
+ adcl $0, %edi
+
+ subl $3, %ecx
+ jg L(top) C at least 3 more to process
+
+
+L(done):
+ C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs respectively
+ cmpl $-1, %ecx
+ jl L(done_0) C if -2, meaning 0 more limbs
+
+ C 1 or 2 more limbs
+ movl $0, %ecx
+ je L(done_1) C if -1, meaning 1 more limb only
+ movl 16(%edx), %ecx
+L(done_1):
+ addl 12(%edx), %eax C 0mod3
+ adcl %ecx, %ebx C 1mod3
+ adcl $0, %esi C 2mod3
+ adcl $0, %edi C carries 0mod3
+
+L(done_0):
+ C eax acc 0mod3
+ C ebx acc 1mod3
+ C ecx
+ C edx
+ C esi acc 2mod3
+ C edi carries 0mod3
+ C ebp
+
+ movl %eax, %ecx C 0mod3
+ shrl $24, %eax C 0mod3 high initial total
+
+ andl $0xFFFFFF, %ecx C 0mod3 low
+ movl %edi, %edx C carries
+ shrl $24, %edi C carries high
+
+ addl %ecx, %eax C add 0mod3 low
+ andl $0xFFFFFF, %edx C carries 0mod3 low
+ movl %ebx, %ecx C 1mod3
+
+ shrl $16, %ebx C 1mod3 high
+ addl %edi, %eax C add carries high
+ addl %edx, %eax C add carries 0mod3 low
+
+ andl $0xFFFF, %ecx C 1mod3 low mask
+ addl %ebx, %eax C add 1mod3 high
+ movl SAVE_EBX, %ebx
+
+ shll $8, %ecx C 1mod3 low
+ movl %esi, %edx C 2mod3
+ popl %edi FRAME_popl()
+
+ shrl $8, %esi C 2mod3 high
+ andl $0xFF, %edx C 2mod3 low mask
+ addl %ecx, %eax C add 1mod3 low
+
+ shll $16, %edx C 2mod3 low
+ addl %esi, %eax C add 2mod3 high
+ movl SAVE_ESI, %esi
+
+ addl %edx, %eax C add 2mod3 low
+
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/p6/mode1o.asm b/gmp-6.3.0/mpn/x86/p6/mode1o.asm
new file mode 100644
index 0000000..7083195
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/mode1o.asm
@@ -0,0 +1,170 @@
+dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 10.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+C It's not worth skipping a step at the end when high<divisor since the main
+C loop is only 10 cycles.
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl Not enough room under modexact_1 to make these re-use the parameter
+dnl space, unfortunately.
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+deflit(STACK_SPACE, 12)
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+ movl PARAM_CARRY, %ecx
+ jmp L(start_1c)
+
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+ xorl %ecx, %ecx
+L(start_1c):
+ movl PARAM_DIVISOR, %eax
+
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ shrl %eax C d/2
+ movl %edi, SAVE_EDI
+
+ andl $127, %eax
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edi)
+ movzbl (%eax,%edi), %edi C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %edi C inv 8 bits
+')
+
+ xorl %edx, %edx C initial extra carry
+ leal (%edi,%edi), %eax C 2*inv
+
+ imull %edi, %edi C inv*inv
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ imull PARAM_DIVISOR, %edi C inv*inv*d
+
+ subl %edi, %eax C inv = 2*inv - inv*inv*d
+ leal (%eax,%eax), %edi C 2*inv
+
+ imull %eax, %eax C inv*inv
+
+ imull PARAM_DIVISOR, %eax C inv*inv*d
+
+ leal (%esi,%ebx,4), %esi C src end
+ negl %ebx C -size
+
+ subl %eax, %edi C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
+ movl PARAM_DIVISOR, %eax
+ imull %edi, %eax
+ cmpl $1, %eax')
+
+
+C The dependent chain here is
+C
+C subl %edx, %eax 1
+C imull %edi, %eax 4
+C mull PARAM_DIVISOR 5
+C ----
+C total 10
+C
+C and this is the measured speed. No special scheduling is necessary, out
+C of order execution hides the load latency.
+
+L(top):
+ C eax scratch (src limb)
+ C ebx counter, limbs, negative
+ C ecx carry bit, 0 or 1
+ C edx carry limb, high of last product
+ C esi &src[size]
+ C edi inverse
+ C ebp
+
+ movl (%esi,%ebx,4), %eax
+ subl %ecx, %eax
+
+ sbbl %ecx, %ecx
+ subl %edx, %eax
+
+ sbbl $0, %ecx
+
+ imull %edi, %eax
+
+ negl %ecx
+
+ mull PARAM_DIVISOR
+
+ incl %ebx
+ jnz L(top)
+
+
+ movl SAVE_ESI, %esi
+ leal (%ecx,%edx), %eax
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86/p6/mul_basecase.asm b/gmp-6.3.0/mpn/x86/p6/mul_basecase.asm
new file mode 100644
index 0000000..d87bc12
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/mul_basecase.asm
@@ -0,0 +1,607 @@
+dnl Intel P6 mpn_mul_basecase -- multiply two mpn numbers.
+
+dnl Copyright 1999-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: approx 6.5 cycles per cross product (16 limbs/loop unrolling).
+
+
+dnl P6 UNROLL_COUNT cycles/product (approx)
+dnl 8 7
+dnl 16 6.5
+dnl 32 6.4
+dnl Maximum possible with the current code is 32.
+
+deflit(UNROLL_COUNT, 16)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C mp_srcptr xp, mp_size_t xsize,
+C mp_srcptr yp, mp_size_t ysize);
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() startup
+C calculations only once.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP, 16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_XSIZE, %ecx
+
+ movl PARAM_YP, %eax
+
+ movl PARAM_XP, %edx
+
+ movl (%eax), %eax C yp[0]
+ cmpl $2, %ecx
+ ja L(xsize_more_than_two)
+ je L(two_by_something)
+
+
+ C one limb by one limb
+
+ mull (%edx)
+
+ movl PARAM_WP, %ecx
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+deflit(`FRAME',0)
+
+dnl re-use parameter space
+define(SAVE_EBX, `PARAM_XSIZE')
+define(SAVE_ESI, `PARAM_YSIZE')
+
+ movl %ebx, SAVE_EBX
+ cmpl $1, PARAM_YSIZE
+ movl %eax, %ecx C yp[0]
+
+ movl %esi, SAVE_ESI C save esi
+ movl PARAM_WP, %ebx
+ movl %edx, %esi C xp
+
+ movl (%edx), %eax C xp[0]
+ jne L(two_by_two)
+
+
+ C two limbs by one limb
+ C
+ C eax xp[0]
+ C ebx wp
+ C ecx yp[0]
+ C edx
+ C esi xp
+
+ mull %ecx
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+ movl %edx, %esi C carry
+
+ mull %ecx
+
+ addl %eax, %esi
+
+ movl %esi, 4(%ebx)
+ movl SAVE_ESI, %esi
+
+ adcl $0, %edx
+
+ movl %edx, 8(%ebx)
+ movl SAVE_EBX, %ebx
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+
+ ALIGN(16)
+L(two_by_two):
+ C eax xp[0]
+ C ebx wp
+ C ecx yp[0]
+ C edx
+ C esi xp
+ C edi
+ C ebp
+
+dnl more parameter space re-use
+define(SAVE_EDI, `PARAM_WP')
+
+ mull %ecx C xp[0] * yp[0]
+
+ movl %edi, SAVE_EDI
+ movl %edx, %edi C carry, for wp[1]
+
+ movl %eax, (%ebx)
+ movl 4(%esi), %eax
+
+ mull %ecx C xp[1] * yp[0]
+
+ addl %eax, %edi
+ movl PARAM_YP, %ecx
+
+ adcl $0, %edx
+ movl 4(%ecx), %ecx C yp[1]
+
+ movl %edi, 4(%ebx)
+ movl 4(%esi), %eax C xp[1]
+ movl %edx, %edi C carry, for wp[2]
+
+ mull %ecx C xp[1] * yp[1]
+
+ addl %eax, %edi
+ movl (%esi), %eax C xp[0]
+
+ adcl $0, %edx
+ movl %edx, %esi C carry, for wp[3]
+
+ mull %ecx C xp[0] * yp[1]
+
+ addl %eax, 4(%ebx)
+ movl %esi, %eax
+
+ adcl %edx, %edi
+ movl SAVE_ESI, %esi
+
+ movl %edi, 8(%ebx)
+
+ adcl $0, %eax
+ movl SAVE_EDI, %edi
+
+ movl %eax, 12(%ebx)
+ movl SAVE_EBX, %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(xsize_more_than_two):
+
+C The first limb of yp is processed with a simple mpn_mul_1 loop running at
+C about 6.2 c/l. Unrolling this doesn't seem worthwhile since it's only run
+C once (whereas the addmul_1 below is run ysize-1 many times). A call to
+C mpn_mul_1 would be slowed down by the parameter pushing and popping etc,
+C and doesn't seem likely to be worthwhile on the typical sizes reaching
+C here from the Karatsuba code.
+
+ C eax yp[0]
+ C ebx
+ C ecx xsize
+ C edx xp
+ C esi
+ C edi
+ C ebp
+
+defframe(`SAVE_EBX', -4)
+defframe(`SAVE_ESI', -8)
+defframe(`SAVE_EDI', -12)
+defframe(`SAVE_EBP', -16)
+defframe(VAR_COUNTER, -20) dnl for use in the unroll case
+defframe(VAR_ADJUST, -24)
+defframe(VAR_JMP, -28)
+defframe(VAR_SWAP, -32)
+defframe(VAR_XP_LOW, -36)
+deflit(STACK_SPACE, 36)
+
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %edi, SAVE_EDI
+ movl PARAM_WP, %edi
+
+ movl %ebx, SAVE_EBX
+
+ movl %ebp, SAVE_EBP
+ movl %eax, %ebp
+
+ movl %esi, SAVE_ESI
+ xorl %ebx, %ebx
+ leal (%edx,%ecx,4), %esi C xp end
+
+ leal (%edi,%ecx,4), %edi C wp end of mul1
+ negl %ecx
+
+
+L(mul1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, negative
+ C edx scratch
+ C esi xp end
+ C edi wp end of mul1
+ C ebp multiplier
+
+ movl (%esi,%ecx,4), %eax
+
+ mull %ebp
+
+ addl %ebx, %eax
+ movl %eax, (%edi,%ecx,4)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ incl %ecx
+ jnz L(mul1)
+
+
+ movl PARAM_YSIZE, %edx
+
+ movl %ebx, (%edi) C final carry
+ movl PARAM_XSIZE, %ecx
+ decl %edx
+
+ jz L(done) C if ysize==1
+
+ cmpl $UNROLL_THRESHOLD, %ecx
+ movl PARAM_YP, %eax
+ jae L(unroll)
+
+
+C -----------------------------------------------------------------------------
+ C simple addmul looping
+ C
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ leal 4(%eax,%edx,4), %ebp C yp end
+ negl %ecx
+ negl %edx
+
+ movl %edx, PARAM_YSIZE C -(ysize-1)
+ movl (%esi,%ecx,4), %eax C xp low limb
+ incl %ecx
+
+ movl %ecx, PARAM_XSIZE C -(xsize-1)
+ xorl %ebx, %ebx C initial carry
+
+ movl %ebp, PARAM_YP
+ movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
+ jmp L(simple_outer_entry)
+
+
+L(simple_outer_top):
+ C ebp ysize counter, negative
+
+ movl PARAM_YP, %edx
+
+ movl PARAM_XSIZE, %ecx C -(xsize-1)
+ xorl %ebx, %ebx C carry
+
+ movl %ebp, PARAM_YSIZE
+ addl $4, %edi C next position in wp
+
+ movl (%edx,%ebp,4), %ebp C yp limb - multiplier
+
+ movl -4(%esi,%ecx,4), %eax C xp low limb
+
+
+L(simple_outer_entry):
+
+L(simple_inner_top):
+ C eax xp limb
+ C ebx carry limb
+ C ecx loop counter (negative)
+ C edx scratch
+ C esi xp end
+ C edi wp end
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %eax, %ebx
+ adcl $0, %edx
+
+ addl %ebx, (%edi,%ecx,4)
+ movl (%esi,%ecx,4), %eax
+ adcl $0, %edx
+
+ incl %ecx
+ movl %edx, %ebx
+ jnz L(simple_inner_top)
+
+
+ C separate code for last limb so outer loop counter handling can be
+ C interleaved
+
+ mull %ebp
+
+ movl PARAM_YSIZE, %ebp
+ addl %eax, %ebx
+
+ adcl $0, %edx
+
+ addl %ebx, (%edi)
+
+ adcl $0, %edx
+ incl %ebp
+
+ movl %edx, 4(%edi)
+ jnz L(simple_outer_top)
+
+
+L(done):
+ movl SAVE_EBX, %ebx
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+C The unrolled loop is the same as in mpn_addmul_1, see that code for some
+C comments.
+C
+C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
+C increment xp and wp. This is used to adjust xp and wp, and is rshifted to
+C given an initial VAR_COUNTER at the top of the outer loop.
+C
+C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
+C up to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C VAR_SWAP is 0 if xsize odd or 0xFFFFFFFF if xsize even, used to swap the
+C initial ebx and ecx on entry to the unrolling.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C The trick with the VAR_ADJUST value means it's only necessary to do one
+C fetch in the outer loop to take care of xp, wp and the inner loop counter.
+
+
+L(unroll):
+ C eax yp
+ C ebx
+ C ecx xsize
+ C edx ysize-1
+ C esi xp end
+ C edi wp end of mul1
+ C ebp
+
+ movl PARAM_XP, %esi
+
+ movl 4(%eax), %ebp C multiplier (yp second limb)
+ leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
+
+ movl %eax, PARAM_YP
+ movl PARAM_WP, %edi
+ negl %edx
+
+ movl %edx, PARAM_YSIZE
+ leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
+ decl %ecx C xsize-1
+
+ movl (%esi), %eax C xp low limb
+ andl $-UNROLL_MASK-1, %ebx
+ negl %ecx C -(xsize-1)
+
+ negl %ebx
+ andl $UNROLL_MASK, %ecx
+
+ movl %ebx, VAR_ADJUST
+ movl %ecx, %edx
+ shll $4, %ecx
+
+ movl %eax, VAR_XP_LOW
+ sarl $UNROLL_LOG2, %ebx
+ negl %edx
+
+ C 15 code bytes per limb
+ifdef(`PIC',`
+ call L(pic_calc)
+L(unroll_here):
+',`
+ leal L(unroll_inner_entry) (%ecx,%edx,1), %ecx
+')
+
+ movl %ecx, VAR_JMP
+ movl %edx, %ecx
+ shll $31, %edx
+
+ sarl $31, %edx C 0 or -1 as xsize odd or even
+ leal 4(%edi,%ecx,4), %edi C wp and xp, adjust for unrolling,
+ leal 4(%esi,%ecx,4), %esi C and start at second limb
+
+ movl %edx, VAR_SWAP
+ jmp L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ leal (%ecx,%edx,1), %ecx
+ addl $L(unroll_inner_entry)-L(unroll_here), %ecx
+ addl (%esp), %ecx
+ ret_internal
+')
+
+
+C --------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi xp + offset
+ C edi wp + offset
+ C ebp ysize counter, negative
+
+ movl VAR_ADJUST, %ebx
+ movl PARAM_YP, %edx
+
+ movl VAR_XP_LOW, %eax
+ movl %ebp, PARAM_YSIZE C store incremented ysize counter
+
+ leal eval(UNROLL_BYTES + 4) (%edi,%ebx,4), %edi
+ leal (%esi,%ebx,4), %esi
+ sarl $UNROLL_LOG2, %ebx
+
+ movl (%edx,%ebp,4), %ebp C yp next multiplier
+
+L(unroll_outer_entry):
+ mull %ebp
+
+ movl %ebx, VAR_COUNTER
+ movl %edx, %ebx C carry high
+ movl %eax, %ecx C carry low
+
+ xorl %edx, %eax
+ movl VAR_JMP, %edx
+
+ andl VAR_SWAP, %eax
+
+ xorl %eax, %ebx C carries other way for odd index
+ xorl %eax, %ecx
+
+ jmp *%edx
+
+
+C -----------------------------------------------------------------------------
+
+L(unroll_inner_top):
+ C eax xp limb
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi xp+8
+ C edi wp
+ C ebp yp multiplier limb
+ C
+ C VAR_COUNTER loop counter, negative
+ C
+ C 15 bytes each limb
+
+ addl $UNROLL_BYTES, %edi
+
+L(unroll_inner_entry):
+
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+ deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+ deflit(`disp1', eval(disp0 + 4))
+
+Zdisp( movl, disp0,(%esi), %eax)
+ mull %ebp
+Zdisp( addl, %ecx, disp0,(%edi))
+ adcl %eax, %ebx C new carry low
+ movl %edx, %ecx
+ adcl $0, %ecx C new carry high
+
+ movl disp1(%esi), %eax
+ mull %ebp
+ addl %ebx, disp1(%edi)
+ adcl %eax, %ecx C new carry low
+ movl %edx, %ebx
+ adcl $0, %ebx C new carry high
+')
+
+
+ incl VAR_COUNTER
+ leal UNROLL_BYTES(%esi), %esi
+ jnz L(unroll_inner_top)
+
+
+ C eax
+ C ebx carry high
+ C ecx carry low
+ C edx
+ C esi
+ C edi wp, pointing at second last limb)
+ C ebp
+
+deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
+deflit(`disp1', eval(disp0 + 4))
+
+ movl PARAM_YSIZE, %ebp
+ addl %ecx, disp0(%edi) C carry low
+
+ adcl $0, %ebx
+ incl %ebp
+
+ movl %ebx, disp1(%edi) C carry high
+ jnz L(unroll_outer_top)
+
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/p6/p3mmx/popham.asm b/gmp-6.3.0/mpn/x86/p6/p3mmx/popham.asm
new file mode 100644
index 0000000..db2f260
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/p3mmx/popham.asm
@@ -0,0 +1,42 @@
+dnl Intel Pentium-III mpn_popcount, mpn_hamdist -- population count and
+dnl hamming distance.
+
+dnl Copyright 2000, 2002, 2004, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C popcount hamdist
+C P3 generic 6.5 7
+C P3 model 9 (Banias) ? ?
+C P3 model 13 (Dothan) 5.75 6
+
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+include_mpn(`x86/k7/mmx/popham.asm')
diff --git a/gmp-6.3.0/mpn/x86/p6/sqr_basecase.asm b/gmp-6.3.0/mpn/x86/p6/sqr_basecase.asm
new file mode 100644
index 0000000..8fc7fdf
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/sqr_basecase.asm
@@ -0,0 +1,649 @@
+dnl Intel P6 mpn_sqr_basecase -- square an mpn number.
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
+C product (measured on the speed difference between 20 and 40 limbs,
+C which is the Karatsuba recursing range).
+
+
+dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
+dnl a description. The only difference here is that UNROLL_COUNT can go up
+dnl to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.
+
+deflit(SQR_TOOM2_THRESHOLD_MAX, 67)
+
+ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
+`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
+
+m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
+deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overheads are avoided, especially when the given size
+C is small.
+C
+C The code size might look a bit excessive, but not all of it is executed so
+C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
+C clearly apply only to those sizes; mid sizes like 10x10 only need part of
+C the unrolled addmul; and big sizes like 40x40 that do use the full
+C unrolling will least be making good use of it, because 40x40 will take
+C something like 7000 cycles.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+
+ movl PARAM_SRC, %eax
+
+ cmpl $2, %edx
+ movl PARAM_DST, %ecx
+ je L(two_limbs)
+
+ movl (%eax), %eax
+ ja L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+ C eax src limb
+ C ebx
+ C ecx dst
+ C edx
+
+ mull %eax
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(two_limbs):
+ C eax src
+ C ebx
+ C ecx dst
+ C edx
+
+defframe(SAVE_ESI, -4)
+defframe(SAVE_EBX, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(`STACK_SPACE',16)
+
+ subl $STACK_SPACE, %esp
+deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl %eax, %esi
+ movl (%eax), %eax
+
+ mull %eax C src[0]^2
+
+ movl %eax, (%ecx) C dst[0]
+ movl 4(%esi), %eax
+
+ movl %ebx, SAVE_EBX
+ movl %edx, %ebx C dst[1]
+
+ mull %eax C src[1]^2
+
+ movl %edi, SAVE_EDI
+ movl %eax, %edi C dst[2]
+ movl (%esi), %eax
+
+ movl %ebp, SAVE_EBP
+ movl %edx, %ebp C dst[3]
+
+ mull 4(%esi) C src[0]*src[1]
+
+ addl %eax, %ebx
+ movl SAVE_ESI, %esi
+
+ adcl %edx, %edi
+
+ adcl $0, %ebp
+ addl %ebx, %eax
+ movl SAVE_EBX, %ebx
+
+ adcl %edi, %edx
+ movl SAVE_EDI, %edi
+
+ adcl $0, %ebp
+
+ movl %eax, 4(%ecx)
+
+ movl %ebp, 12(%ecx)
+ movl SAVE_EBP, %ebp
+
+ movl %edx, 8(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+L(three_or_more):
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx size
+deflit(`FRAME',0)
+
+ pushl %esi defframe_pushl(`SAVE_ESI')
+ cmpl $4, %edx
+
+ movl PARAM_SRC, %esi
+ jae L(four_or_more)
+
+
+C -----------------------------------------------------------------------------
+C three limbs
+
+ C eax src low limb
+ C ebx
+ C ecx dst
+ C edx
+ C esi src
+ C edi
+ C ebp
+
+ pushl %ebp defframe_pushl(`SAVE_EBP')
+ pushl %edi defframe_pushl(`SAVE_EDI')
+
+ mull %eax C src[0] ^ 2
+
+ movl %eax, (%ecx)
+ movl %edx, 4(%ecx)
+
+ movl 4(%esi), %eax
+ xorl %ebp, %ebp
+
+ mull %eax C src[1] ^ 2
+
+ movl %eax, 8(%ecx)
+ movl %edx, 12(%ecx)
+ movl 8(%esi), %eax
+
+ pushl %ebx defframe_pushl(`SAVE_EBX')
+
+ mull %eax C src[2] ^ 2
+
+ movl %eax, 16(%ecx)
+ movl %edx, 20(%ecx)
+
+ movl (%esi), %eax
+
+ mull 4(%esi) C src[0] * src[1]
+
+ movl %eax, %ebx
+ movl %edx, %edi
+
+ movl (%esi), %eax
+
+ mull 8(%esi) C src[0] * src[2]
+
+ addl %eax, %edi
+ movl %edx, %ebp
+
+ adcl $0, %ebp
+ movl 4(%esi), %eax
+
+ mull 8(%esi) C src[1] * src[2]
+
+ xorl %esi, %esi
+ addl %eax, %ebp
+
+ C eax
+ C ebx dst[1]
+ C ecx dst
+ C edx dst[4]
+ C esi zero, will be dst[5]
+ C edi dst[2]
+ C ebp dst[3]
+
+ adcl $0, %edx
+ addl %ebx, %ebx
+
+ adcl %edi, %edi
+
+ adcl %ebp, %ebp
+
+ adcl %edx, %edx
+ movl 4(%ecx), %eax
+
+ adcl $0, %esi
+ addl %ebx, %eax
+
+ movl %eax, 4(%ecx)
+ movl 8(%ecx), %eax
+
+ adcl %edi, %eax
+ movl 12(%ecx), %ebx
+
+ adcl %ebp, %ebx
+ movl 16(%ecx), %edi
+
+ movl %eax, 8(%ecx)
+ movl SAVE_EBP, %ebp
+
+ movl %ebx, 12(%ecx)
+ movl SAVE_EBX, %ebx
+
+ adcl %edx, %edi
+ movl 20(%ecx), %eax
+
+ movl %edi, 16(%ecx)
+ movl SAVE_EDI, %edi
+
+ adcl %esi, %eax C no carry out of this
+ movl SAVE_ESI, %esi
+
+ movl %eax, 20(%ecx)
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+defframe(VAR_COUNTER,-20)
+defframe(VAR_JMP, -24)
+deflit(`STACK_SPACE',24)
+
+L(four_or_more):
+ C eax src low limb
+ C ebx
+ C ecx
+ C edx size
+ C esi src
+ C edi
+ C ebp
+deflit(`FRAME',4) dnl %esi already pushed
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+ subl $STACK_SPACE-FRAME, %esp
+deflit(`FRAME',STACK_SPACE)
+ movl $1, %ecx
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebx, SAVE_EBX
+ subl %edx, %ecx C -(size-1)
+
+ movl %ebp, SAVE_EBP
+ movl $0, %ebx C initial carry
+
+ leal (%esi,%edx,4), %esi C &src[size]
+ movl %eax, %ebp C multiplier
+
+ leal -4(%edi,%edx,4), %edi C &dst[size-1]
+
+
+C This loop runs at just over 6 c/l.
+
+L(mul_1):
+ C eax scratch
+ C ebx carry
+ C ecx counter, limbs, negative, -(size-1) to -1
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size-1]
+ C ebp multiplier
+
+ movl %ebp, %eax
+
+ mull (%esi,%ecx,4)
+
+ addl %ebx, %eax
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+ movl %eax, 4(%edi,%ecx,4)
+
+ incl %ecx
+ jnz L(mul_1)
+
+
+ movl %ebx, 4(%edi)
+
+
+C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
+C
+C The last two addmuls, which are the bottom right corner of the product
+C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
+C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
+C cases that need to be done.
+C
+C The unrolled code is the same as mpn_addmul_1(), see that routine for some
+C comments.
+C
+C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled code, stepped by one code
+C chunk each outer loop.
+
+dnl This is also hard-coded in the address calculation below.
+deflit(CODE_BYTES_PER_LIMB, 15)
+
+dnl With &src[size] and &dst[size-1] pointers, the displacements in the
+dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
+dnl that an offset must be added to them.
+deflit(OFFSET,
+ifelse(eval(UNROLL_COUNT>32),1,
+eval((UNROLL_COUNT-32)*4),
+0))
+
+ C eax
+ C ebx carry
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[size-1]
+ C ebp
+
+ movl PARAM_SIZE, %ecx
+
+ subl $4, %ecx
+ jz L(corner)
+
+ movl %ecx, %edx
+ negl %ecx
+
+ shll $4, %ecx
+ifelse(OFFSET,0,,`subl $OFFSET, %esi')
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
+')
+ negl %edx
+
+ifelse(OFFSET,0,,`subl $OFFSET, %edi')
+
+ C The calculated jump mustn't be before the start of the available
+ C code. This is the limit that UNROLL_COUNT puts on the src operand
+ C size, but checked here using the jump address directly.
+
+ ASSERT(ae,
+ `movl_text_address( L(unroll_inner_start), %eax)
+ cmpl %eax, %ecx')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll_outer_top):
+ C eax
+ C ebx high limb to store
+ C ecx VAR_JMP
+ C edx VAR_COUNTER, limbs, negative
+ C esi &src[size], constant
+ C edi dst ptr, second highest limb of last addmul
+ C ebp
+
+ movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier
+ movl %edx, VAR_COUNTER
+
+ movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand
+
+ mull %ebp
+
+define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
+
+ testb $1, %cl
+
+ movl %edx, %ebx C high carry
+ leal 4(%edi), %edi
+
+ movl %ecx, %edx C jump
+
+ movl %eax, %ecx C low carry
+ leal CODE_BYTES_PER_LIMB(%edx), %edx
+
+ cmovX( %ebx, %ecx) C high carry reverse
+ cmovX( %eax, %ebx) C low carry reverse
+ movl %edx, VAR_JMP
+ jmp *%edx
+
+
+ C Must be on an even address here so the low bit of the jump address
+ C will indicate which way around ecx/ebx should start.
+
+ ALIGN(2)
+
+L(unroll_inner_start):
+ C eax scratch
+ C ebx carry high
+ C ecx carry low
+ C edx scratch
+ C esi src pointer
+ C edi dst pointer
+ C ebp multiplier
+ C
+ C 15 code bytes each limb
+ C ecx/ebx reversed on each chunk
+
+forloop(`i', UNROLL_COUNT, 1, `
+ deflit(`disp_src', eval(-i*4 + OFFSET))
+ deflit(`disp_dst', eval(disp_src))
+
+ m4_assert(`disp_src>=-128 && disp_src<128')
+ m4_assert(`disp_dst>=-128 && disp_dst<128')
+
+ifelse(eval(i%2),0,`
+Zdisp( movl, disp_src,(%esi), %eax)
+ mull %ebp
+Zdisp( addl, %ebx, disp_dst,(%edi))
+ adcl %eax, %ecx
+ movl %edx, %ebx
+ adcl $0, %ebx
+',`
+ dnl this one comes out last
+Zdisp( movl, disp_src,(%esi), %eax)
+ mull %ebp
+Zdisp( addl, %ecx, disp_dst,(%edi))
+ adcl %eax, %ebx
+ movl %edx, %ecx
+ adcl $0, %ecx
+')
+')
+L(unroll_inner_end):
+
+ addl %ebx, m4_empty_if_zero(OFFSET)(%edi)
+
+ movl VAR_COUNTER, %edx
+ adcl $0, %ecx
+
+ movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi)
+ movl VAR_JMP, %ecx
+
+ incl %edx
+ jnz L(unroll_outer_top)
+
+
+ifelse(OFFSET,0,,`
+ addl $OFFSET, %esi
+ addl $OFFSET, %edi
+')
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(corner):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi &src[size]
+ C edi &dst[2*size-5]
+ C ebp
+
+ movl -12(%esi), %eax
+
+ mull -8(%esi)
+
+ addl %eax, (%edi)
+ movl -12(%esi), %eax
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+
+ mull -4(%esi)
+
+ addl %eax, %ebx
+ movl -8(%esi), %eax
+
+ adcl $0, %edx
+
+ addl %ebx, 4(%edi)
+ movl $0, %ebx
+
+ adcl %edx, %ebx
+
+ mull -4(%esi)
+
+ movl PARAM_SIZE, %ecx
+ addl %ebx, %eax
+
+ adcl $0, %edx
+
+ movl %eax, 8(%edi)
+
+ movl %edx, 12(%edi)
+ movl PARAM_DST, %edi
+
+
+C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
+
+ subl $1, %ecx C size-1
+ xorl %eax, %eax C ready for final adcl, and clear carry
+
+ movl %ecx, %edx
+ movl PARAM_SRC, %esi
+
+
+L(lshift):
+ C eax
+ C ebx
+ C ecx counter, size-1 to 1
+ C edx size-1 (for later use)
+ C esi src (for later use)
+ C edi dst, incrementing
+ C ebp
+
+ rcll 4(%edi)
+ rcll 8(%edi)
+
+ leal 8(%edi), %edi
+ decl %ecx
+ jnz L(lshift)
+
+
+ adcl %eax, %eax
+
+ movl %eax, 4(%edi) C dst most significant limb
+ movl (%esi), %eax C src[0]
+
+ leal 4(%esi,%edx,4), %esi C &src[size]
+ subl %edx, %ecx C -(size-1)
+
+
+C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
+C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
+C low limb of src[0]^2.
+
+
+ mull %eax
+
+ movl %eax, (%edi,%ecx,8) C dst[0]
+
+
+L(diag):
+ C eax scratch
+ C ebx scratch
+ C ecx counter, negative
+ C edx carry
+ C esi &src[size]
+ C edi dst[2*size-2]
+ C ebp
+
+ movl (%esi,%ecx,4), %eax
+ movl %edx, %ebx
+
+ mull %eax
+
+ addl %ebx, 4(%edi,%ecx,8)
+ adcl %eax, 8(%edi,%ecx,8)
+ adcl $0, %edx
+
+ incl %ecx
+ jnz L(diag)
+
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ addl %edx, 4(%edi) C dst most significant limb
+
+ movl SAVE_EDI, %edi
+ movl SAVE_EBP, %ebp
+ addl $FRAME, %esp
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ifdef(`PIC',`
+L(pic_calc):
+ addl (%esp), %ecx
+ addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx
+ addl %edx, %ecx
+ ret_internal
+')
+
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm b/gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm
new file mode 100644
index 0000000..144b627
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/sse2/addmul_1.asm
@@ -0,0 +1,37 @@
+dnl Intel P6/SSE2 mpn_addmul_1.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Write P6 specific SSE2 code.
+
+MULFUNC_PROLOGUE(mpn_addmul_1)
+include_mpn(`x86/pentium4/sse2/addmul_1.asm')
diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h b/gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h
new file mode 100644
index 0000000..a1e261b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/sse2/gmp-mparam.h
@@ -0,0 +1,200 @@
+/* Intel P6/sse2 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2008-2010 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+
+/* NOTE: In a fat binary build SQR_TOOM2_THRESHOLD here cannot be more than the
+ value in mpn/x86/p6/gmp-mparam.h. The latter is used as a hard limit in
+ mpn/x86/p6/sqr_basecase.asm. */
+
+
+/* 1867 MHz P6 model 13 */
+
+#define MOD_1_NORM_THRESHOLD 4
+#define MOD_1_UNNORM_THRESHOLD 4
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 4
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 11
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 21
+
+#define MUL_TOOM22_THRESHOLD 20
+#define MUL_TOOM33_THRESHOLD 77
+#define MUL_TOOM44_THRESHOLD 169
+#define MUL_TOOM6H_THRESHOLD 246
+#define MUL_TOOM8H_THRESHOLD 381
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 73
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 114
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 97
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 80
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 106
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 30
+#define SQR_TOOM3_THRESHOLD 101
+#define SQR_TOOM4_THRESHOLD 154
+#define SQR_TOOM6_THRESHOLD 222
+#define SQR_TOOM8_THRESHOLD 527
+
+#define MULMID_TOOM42_THRESHOLD 58
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 17
+
+#define MUL_FFT_MODF_THRESHOLD 690 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 28, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 9}, { 15, 8}, \
+ { 31, 7}, { 63, 8}, { 39, 9}, { 23, 5}, \
+ { 383, 4}, { 991, 5}, { 511, 6}, { 267, 7}, \
+ { 157, 8}, { 91, 9}, { 47, 8}, { 111, 9}, \
+ { 63, 8}, { 127, 9}, { 79,10}, { 47, 9}, \
+ { 95,11}, { 31,10}, { 63, 9}, { 135,10}, \
+ { 79, 9}, { 159,10}, { 95,11}, { 63,10}, \
+ { 143, 9}, { 287,10}, { 159,11}, { 95,10}, \
+ { 191,12}, { 63,11}, { 127,10}, { 255, 9}, \
+ { 511,10}, { 271, 9}, { 543,10}, { 287,11}, \
+ { 159,10}, { 335, 9}, { 671,11}, { 191,10}, \
+ { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \
+ { 415,11}, { 223,12}, { 127,11}, { 255,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 607,11}, \
+ { 319,10}, { 671,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,10}, { 831,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,10}, \
+ { 1215,12}, { 319,11}, { 671,10}, { 1343,11}, \
+ { 735,10}, { 1471,12}, { 383,11}, { 799,10}, \
+ { 1599,11}, { 863,12}, { 447,11}, { 959,13}, \
+ { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \
+ { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \
+ { 1471,13}, { 383,12}, { 831,11}, { 1727,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
+ { 639,12}, { 1471,11}, { 2943,13}, { 767,12}, \
+ { 1727,13}, { 895,12}, { 1919,14}, { 511,13}, \
+ { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \
+ { 1407,12}, { 2815,14}, { 767,13}, { 1663,12}, \
+ { 3455,13}, { 8192,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 132
+#define MUL_FFT_THRESHOLD 7424
+
+#define SQR_FFT_MODF_THRESHOLD 565 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 472, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 25, 7}, { 13, 6}, { 27, 7}, { 15, 6}, \
+ { 31, 7}, { 17, 6}, { 35, 7}, { 27, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \
+ { 39, 9}, { 23, 8}, { 51,10}, { 15, 9}, \
+ { 31, 8}, { 63, 4}, { 1023, 8}, { 67, 9}, \
+ { 39, 5}, { 639, 4}, { 1471, 6}, { 383, 7}, \
+ { 209, 8}, { 119, 9}, { 63, 7}, { 255, 8}, \
+ { 139, 9}, { 71, 8}, { 143, 9}, { 79,10}, \
+ { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159, 8}, { 319, 9}, \
+ { 167,10}, { 95,11}, { 63,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127,10}, { 255, 9}, { 543, 8}, \
+ { 1087,10}, { 287, 9}, { 575,11}, { 159,10}, \
+ { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \
+ { 351, 9}, { 703,11}, { 191,10}, { 383, 9}, \
+ { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \
+ { 831,11}, { 223,12}, { 127,11}, { 255,10}, \
+ { 543, 9}, { 1087,11}, { 287,10}, { 607, 9}, \
+ { 1215,11}, { 319,10}, { 671, 9}, { 1343,11}, \
+ { 351,10}, { 703,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,10}, { 831,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1087,11}, { 607,12}, \
+ { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \
+ { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \
+ { 447,11}, { 959,13}, { 255,12}, { 511,11}, \
+ { 1087,12}, { 575,11}, { 1215,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1727,12}, \
+ { 959,14}, { 255,13}, { 511,12}, { 1215,13}, \
+ { 639,12}, { 1471,13}, { 767,12}, { 1727,13}, \
+ { 895,12}, { 1919,14}, { 511,13}, { 1023,12}, \
+ { 2111,13}, { 1151,12}, { 2431,13}, { 1407,14}, \
+ { 767,13}, { 1663,12}, { 3455,13}, { 8192,14}, \
+ { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 146
+#define SQR_FFT_THRESHOLD 5760
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 31
+#define MULLO_MUL_N_THRESHOLD 13463
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 100
+#define SQRLO_SQR_THRESHOLD 9236
+
+#define DC_DIV_QR_THRESHOLD 25
+#define DC_DIVAPPR_Q_THRESHOLD 55
+#define DC_BDIV_QR_THRESHOLD 60
+#define DC_BDIV_Q_THRESHOLD 132
+
+#define INV_MULMOD_BNM1_THRESHOLD 38
+#define INV_NEWTON_THRESHOLD 65
+#define INV_APPR_THRESHOLD 65
+
+#define BINV_NEWTON_THRESHOLD 252
+#define REDC_1_TO_REDC_N_THRESHOLD 62
+
+#define MU_DIV_QR_THRESHOLD 1164
+#define MU_DIVAPPR_Q_THRESHOLD 748
+#define MUPI_DIV_QR_THRESHOLD 38
+#define MU_BDIV_QR_THRESHOLD 1360
+#define MU_BDIV_Q_THRESHOLD 1470
+
+#define POWM_SEC_TABLE 2,23,258,879,2246
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 25
+#define SET_STR_DC_THRESHOLD 582
+#define SET_STR_PRECOMPUTE_THRESHOLD 1118
+
+#define FAC_DSC_THRESHOLD 178
+#define FAC_ODD_THRESHOLD 34
+
+#define MATRIX22_STRASSEN_THRESHOLD 17
+#define HGCD_THRESHOLD 69
+#define HGCD_APPR_THRESHOLD 112
+#define HGCD_REDUCE_THRESHOLD 3389
+#define GCD_DC_THRESHOLD 386
+#define GCDEXT_DC_THRESHOLD 303
+#define JACOBI_BASE_METHOD 1
diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm b/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm
new file mode 100644
index 0000000..8b7b7ad
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_1.asm
@@ -0,0 +1,34 @@
+dnl Intel P6/SSE2 mpn_mod_1_1.
+
+dnl Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1_1p)
+include_mpn(`x86/pentium4/sse2/mod_1_1.asm')
diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm b/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm
new file mode 100644
index 0000000..49c96c6
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/sse2/mod_1_4.asm
@@ -0,0 +1,34 @@
+dnl Intel P6/SSE2 mpn_mod_1_4.
+
+dnl Copyright 2009, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_mod_1s_4p)
+include_mpn(`x86/pentium4/sse2/mod_1_4.asm')
diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm b/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm
new file mode 100644
index 0000000..50e5b69
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/sse2/mul_1.asm
@@ -0,0 +1,38 @@
+dnl Intel P6/SSE2 mpn_mul_1.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO
+C * Write P6 specific SSE2 code. It should reach 3 c/l.
+C The Pentium4 code runs at 4.2 c/l.
+
+MULFUNC_PROLOGUE(mpn_mul_1)
+include_mpn(`x86/pentium4/sse2/mul_1.asm')
diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm b/gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm
new file mode 100644
index 0000000..4687625
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/sse2/mul_basecase.asm
@@ -0,0 +1,35 @@
+dnl Intel P6/SSE2 mpn_mul_basecase.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_mul_basecase)
+include_mpn(`x86/pentium4/sse2/mul_basecase.asm')
diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm b/gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm
new file mode 100644
index 0000000..4c02b93
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/sse2/popcount.asm
@@ -0,0 +1,35 @@
+dnl Intel P6/SSE2 mpn_popcount -- population count.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86/pentium4/sse2/popcount.asm')
diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm b/gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm
new file mode 100644
index 0000000..76b574b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/sse2/sqr_basecase.asm
@@ -0,0 +1,35 @@
+dnl Intel P6/SSE2 mpn_sqr_basecase.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_sqr_basecase)
+include_mpn(`x86/pentium4/sse2/sqr_basecase.asm')
diff --git a/gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm b/gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm
new file mode 100644
index 0000000..69d940d
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/p6/sse2/submul_1.asm
@@ -0,0 +1,35 @@
+dnl Intel P6/SSE2 mpn_submul_1.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+MULFUNC_PROLOGUE(mpn_submul_1)
+include_mpn(`x86/pentium4/sse2/submul_1.asm')