Diffstat (limited to 'gmp-6.3.0/mpn/x86/pentium4/sse2')
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm           101
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm       108
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm        189
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm      141
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm        234
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm        95
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm       114
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm          216
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm        645
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h        213
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm         166
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm         269
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm     175
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm          175
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm           164
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm    662
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm        281
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm       126
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm    705
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm           119
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm        182
21 files changed, 5080 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm
new file mode 100644
index 0000000..8e2380e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/add_n.asm
@@ -0,0 +1,101 @@
+dnl Intel Pentium-4 mpn_add_n -- mpn addition.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                              cycles/limb
+C                           dst!=src1,2  dst==src1  dst==src2
+C P6 model 0-8,10-12            -
+C P6 model 9 (Banias)           ?
+C P6 model 13 (Dothan)          ?
+C P4 model 0-1 (Willamette)     ?
+C P4 model 2 (Northwood)        4            6          6
+C P4 model 3-4 (Prescott)       4.25         7.5        7.5
+
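+C The loop keeps the carry in %mm0 and does each step in 64-bit SSE2
+C arithmetic: add the two limbs and the carry, store the low 32 bits, and
+C shift right by 32 for the next carry.  A rough C equivalent (an
+C illustrative sketch only, with uint32_t/uint64_t standing in for a limb
+C and a double limb; mpn_add_nc simply seeds cy from its carry argument):
+C
+C   #include <stdint.h>
+C   uint32_t add_n (uint32_t *dst, const uint32_t *s1, const uint32_t *s2,
+C                   long n, uint32_t cy)
+C   {
+C     for (long i = 0; i < n; i++)
+C       {
+C         uint64_t t = (uint64_t) s1[i] + s2[i] + cy;   /* paddq, paddq */
+C         dst[i] = (uint32_t) t;                        /* movd to memory */
+C         cy = (uint32_t) (t >> 32);                    /* psrlq $32 */
+C       }
+C     return cy;                                        /* carry out in %eax */
+C   }
+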
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_add_nc)
+deflit(`FRAME',0)
+ movd PARAM_CARRY, %mm0
+ jmp L(start_nc)
+EPILOGUE()
+
+ ALIGN(8)
+PROLOGUE(mpn_add_n)
+deflit(`FRAME',0)
+ pxor %mm0, %mm0
+L(start_nc):
+ mov PARAM_SRC1, %eax
+ mov %ebx, SAVE_EBX
+ mov PARAM_SRC2, %ebx
+ mov PARAM_DST, %edx
+ mov PARAM_SIZE, %ecx
+
+ lea (%eax,%ecx,4), %eax C src1 end
+ lea (%ebx,%ecx,4), %ebx C src2 end
+ lea (%edx,%ecx,4), %edx C dst end
+ neg %ecx C -size
+
+L(top):
+ C eax src1 end
+ C ebx src2 end
+ C ecx counter, limbs, negative
+ C edx dst end
+ C mm0 carry bit
+
+ movd (%eax,%ecx,4), %mm1
+ movd (%ebx,%ecx,4), %mm2
+ paddq %mm2, %mm1
+
+ paddq %mm1, %mm0
+ movd %mm0, (%edx,%ecx,4)
+
+ psrlq $32, %mm0
+
+ add $1, %ecx
+ jnz L(top)
+
+ movd %mm0, %eax
+ mov SAVE_EBX, %ebx
+ emms
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm
new file mode 100644
index 0000000..93b63b2
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/addlsh1_n.asm
@@ -0,0 +1,108 @@
+dnl Intel Pentium-4 mpn_addlsh1_n -- mpn x+2*y.
+
+dnl Copyright 2001-2004, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                              cycles/limb
+C                           dst!=src1,2  dst==src1  dst==src2
+C P6 model 0-8,10-12            -
+C P6 model 9 (Banias)           ?
+C P6 model 13 (Dothan)          ?
+C P4 model 0-1 (Willamette)     ?
+C P4 model 2 (Northwood)        4.25         6          6
+C P4 model 3-4 (Prescott)       5            8.5        8.5
+
+C The slightly strange combination of indexing and pointer incrementing
+C that's used seems to work best. Not sure why, but %ecx,4 with src1 and/or
+C src2 is a slowdown.
+C
+C The dependent chain is simply the paddq of x+2*y to the previous carry,
+C then psrlq to get the new carry. That makes 4 c/l the target speed, which
+C is almost achieved for separate src/dst but when src==dst the write
+C combining anomalies slow it down.
+
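+C A rough C model of the loop (illustrative sketch only, not the reference
+C code; the names are the sketch's own): the value carried in %mm0 is the
+C full 64-bit sum from the previous iteration, and the psrlq at the top of
+C the loop extracts its high word (0, 1 or 2) as the incoming carry:
+C
+C   #include <stdint.h>
+C   uint32_t addlsh1_n (uint32_t *dst, const uint32_t *x, const uint32_t *y,
+C                       long n)
+C   {
+C     uint64_t acc = 0;                                 /* %mm0 */
+C     for (long i = 0; i < n; i++)
+C       {
+C         uint64_t cy = acc >> 32;                      /* psrlq $32 */
+C         acc = (uint64_t) x[i] + 2 * (uint64_t) y[i] + cy;  /* psllq, paddq */
+C         dst[i] = (uint32_t) acc;                      /* movd to memory */
+C       }
+C     return (uint32_t) (acc >> 32);                    /* carry out, 0..2 */
+C   }
+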
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_addlsh1_n)
+deflit(`FRAME',0)
+
+ mov PARAM_SRC1, %eax
+ mov %ebx, SAVE_EBX
+
+ mov PARAM_SRC2, %ebx
+ pxor %mm0, %mm0 C initial carry
+
+ mov PARAM_DST, %edx
+
+ mov PARAM_SIZE, %ecx
+
+ lea (%edx,%ecx,4), %edx C dst end
+ neg %ecx C -size
+
+L(top):
+ C eax src1 end
+ C ebx src2 end
+ C ecx counter, limbs, negative
+ C edx dst end
+ C mm0 carry
+
+ movd (%ebx), %mm2
+ movd (%eax), %mm1
+ psrlq $32, %mm0
+ lea 4(%eax), %eax
+ lea 4(%ebx), %ebx
+
+ psllq $1, %mm2
+ paddq %mm2, %mm1
+
+ paddq %mm1, %mm0
+
+ movd %mm0, (%edx,%ecx,4)
+ add $1, %ecx
+ jnz L(top)
+
+
+ psrlq $32, %mm0
+ mov SAVE_EBX, %ebx
+ movd %mm0, %eax
+ emms
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm
new file mode 100644
index 0000000..7810207
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/addmul_1.asm
@@ -0,0 +1,189 @@
+dnl mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) 5.24
+C P6 model 13 (Dothan) 5.24
+C P4 model 0-1 (Willamette) 5
+C P4 model 2 (Northwood) 5
+C P4 model 3-4 (Prescott) 5
+
+C TODO:
+C  * Tweak eax/edx offsets in the loop so as to save some lea's
+C * Perhaps software pipeline small-case code
+
+C INPUT PARAMETERS
+C rp sp + 4
+C up sp + 8
+C n sp + 12
+C v0 sp + 16
+
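+C Functionally (an illustrative C sketch, not the reference code; the limb
+C types are the sketch's own), mpn_addmul_1 folds up[]*v0 into rp[] and
+C returns the carry-out limb; mpn_addmul_1c does the same but seeds the
+C carry from its extra argument:
+C
+C   #include <stdint.h>
+C   uint32_t addmul_1 (uint32_t *rp, const uint32_t *up, long n, uint32_t v0)
+C   {
+C     uint64_t cy = 0;                                      /* %mm6 */
+C     for (long i = 0; i < n; i++)
+C       {
+C         uint64_t t = (uint64_t) up[i] * v0 + rp[i] + cy;  /* pmuludq, paddq */
+C         rp[i] = (uint32_t) t;
+C         cy = t >> 32;                                     /* psrlq $32 */
+C       }
+C     return (uint32_t) cy;
+C   }
+C
+C The unrolled code below software-pipelines four such steps per iteration
+C to hide the pmuludq latency.
+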
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_addmul_1)
+ pxor %mm6, %mm6
+L(ent): mov 4(%esp), %edx
+ mov 8(%esp), %eax
+ mov 12(%esp), %ecx
+ movd 16(%esp), %mm7
+ cmp $4, %ecx
+ jnc L(big)
+
+L(lp0): movd (%eax), %mm0
+ lea 4(%eax), %eax
+ movd (%edx), %mm4
+ lea 4(%edx), %edx
+ pmuludq %mm7, %mm0
+ paddq %mm0, %mm4
+ paddq %mm4, %mm6
+ movd %mm6, -4(%edx)
+ psrlq $32, %mm6
+ dec %ecx
+ jnz L(lp0)
+ movd %mm6, %eax
+ emms
+ ret
+
+L(big): and $3, %ecx
+ je L(0)
+ cmp $2, %ecx
+ jc L(1)
+ je L(2)
+ jmp L(3) C FIXME: one case should fall through
+
+L(0): movd (%eax), %mm3
+ sub 12(%esp), %ecx C loop count
+ lea -16(%eax), %eax
+ lea -12(%edx), %edx
+ pmuludq %mm7, %mm3
+ movd 20(%eax), %mm0
+ movd 12(%edx), %mm5
+ pmuludq %mm7, %mm0
+ movd 24(%eax), %mm1
+ paddq %mm3, %mm5
+ movd 16(%edx), %mm4
+ jmp L(00)
+
+L(1): movd (%eax), %mm2
+ sub 12(%esp), %ecx
+ lea -12(%eax), %eax
+ lea -8(%edx), %edx
+ movd 8(%edx), %mm4
+ pmuludq %mm7, %mm2
+ movd 16(%eax), %mm3
+ pmuludq %mm7, %mm3
+ movd 20(%eax), %mm0
+ paddq %mm2, %mm4
+ movd 12(%edx), %mm5
+ jmp L(01)
+
+L(2): movd (%eax), %mm1
+ sub 12(%esp), %ecx
+ lea -8(%eax), %eax
+ lea -4(%edx), %edx
+ pmuludq %mm7, %mm1
+ movd 12(%eax), %mm2
+ movd 4(%edx), %mm5
+ pmuludq %mm7, %mm2
+ movd 16(%eax), %mm3
+ paddq %mm1, %mm5
+ movd 8(%edx), %mm4
+ jmp L(10)
+
+L(3): movd (%eax), %mm0
+ sub 12(%esp), %ecx
+ lea -4(%eax), %eax
+ pmuludq %mm7, %mm0
+ movd 8(%eax), %mm1
+ movd (%edx), %mm4
+ pmuludq %mm7, %mm1
+ movd 12(%eax), %mm2
+ paddq %mm0, %mm4
+ movd 4(%edx), %mm5
+
+ ALIGN(16)
+L(top): pmuludq %mm7, %mm2
+ paddq %mm4, %mm6
+ movd 16(%eax), %mm3
+ paddq %mm1, %mm5
+ movd 8(%edx), %mm4
+ movd %mm6, 0(%edx)
+ psrlq $32, %mm6
+L(10): pmuludq %mm7, %mm3
+ paddq %mm5, %mm6
+ movd 20(%eax), %mm0
+ paddq %mm2, %mm4
+ movd 12(%edx), %mm5
+ movd %mm6, 4(%edx)
+ psrlq $32, %mm6
+L(01): pmuludq %mm7, %mm0
+ paddq %mm4, %mm6
+ movd 24(%eax), %mm1
+ paddq %mm3, %mm5
+ movd 16(%edx), %mm4
+ movd %mm6, 8(%edx)
+ psrlq $32, %mm6
+L(00): pmuludq %mm7, %mm1
+ paddq %mm5, %mm6
+ movd 28(%eax), %mm2
+ paddq %mm0, %mm4
+ movd 20(%edx), %mm5
+ movd %mm6, 12(%edx)
+ psrlq $32, %mm6
+ lea 16(%eax), %eax
+ lea 16(%edx), %edx
+ add $4, %ecx
+ jnz L(top)
+
+L(end): pmuludq %mm7, %mm2
+ paddq %mm4, %mm6
+ paddq %mm1, %mm5
+ movd 8(%edx), %mm4
+ movd %mm6, 0(%edx)
+ psrlq $32, %mm6
+ paddq %mm5, %mm6
+ paddq %mm2, %mm4
+ movd %mm6, 4(%edx)
+ psrlq $32, %mm6
+ paddq %mm4, %mm6
+ movd %mm6, 8(%edx)
+ psrlq $32, %mm6
+ movd %mm6, %eax
+ emms
+ ret
+EPILOGUE()
+PROLOGUE(mpn_addmul_1c)
+ movd 20(%esp), %mm6
+ jmp L(ent)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm
new file mode 100644
index 0000000..354300e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_dbm1c.asm
@@ -0,0 +1,141 @@
+dnl Intel Atom mpn_bdiv_dbm1c.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C P5 -
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) 9.75
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood) 8.25
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 8
+C AMD K6 -
+C AMD K7 -
+C AMD K8
+C AMD K10
+
+C TODO: This code was optimised for atom-32; consider moving it back to the
+C atom directory (atom currently grabs this code), and write a 4-way
+C version (7 c/l).
+
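+C In C terms the operation is roughly the following (sketch only; the
+C portable version lives in mpn/generic/bdiv_dbm1c.c): for each limb, form
+C the double-limb product ap[i]*bd, subtract its low half from the running
+C value h (recording the borrow), store that as the result limb, then
+C subtract the high half plus the borrow:
+C
+C   #include <stdint.h>
+C   uint32_t bdiv_dbm1c (uint32_t *qp, const uint32_t *ap, long n,
+C                        uint32_t bd, uint32_t h)
+C   {
+C     for (long i = 0; i < n; i++)
+C       {
+C         uint64_t p = (uint64_t) ap[i] * bd;         /* pmuludq */
+C         uint32_t p0 = (uint32_t) p, p1 = (uint32_t) (p >> 32);
+C         uint32_t borrow = h < p0;                   /* sub sets CF */
+C         h -= p0;
+C         qp[i] = h;                                  /* mov cy, (rp) */
+C         h -= p1 + borrow;                           /* sbb */
+C       }
+C     return h;
+C   }
+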
+defframe(PARAM_CARRY,20)
+defframe(PARAM_MUL, 16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_RP,`PARAM_MUL')
+define(SAVE_UP,`PARAM_SIZE')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`n', `%ecx')
+define(`reg', `%edx')
+define(`cy', `%eax') C contains the return value
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(mpn_bdiv_dbm1c)
+ mov PARAM_SIZE, n C size
+ mov up, SAVE_UP
+ mov PARAM_SRC, up
+ movd PARAM_MUL, %mm7
+ mov rp, SAVE_RP
+ mov PARAM_DST, rp
+
+ movd (up), %mm0
+ pmuludq %mm7, %mm0
+ shr n
+ mov PARAM_CARRY, cy
+ jz L(eq1)
+
+ movd 4(up), %mm1
+ jc L(odd)
+
+ lea 4(up), up
+ pmuludq %mm7, %mm1
+ movd %mm0, reg
+ psrlq $32, %mm0
+ sub reg, cy
+ movd %mm0, reg
+ movq %mm1, %mm0
+ dec n
+ mov cy, (rp)
+ lea 4(rp), rp
+ jz L(end)
+
+C ALIGN(16)
+L(top): movd 4(up), %mm1
+ sbb reg, cy
+L(odd): movd %mm0, reg
+ psrlq $32, %mm0
+ pmuludq %mm7, %mm1
+ sub reg, cy
+ lea 8(up), up
+ movd %mm0, reg
+ movd (up), %mm0
+ mov cy, (rp)
+ sbb reg, cy
+ movd %mm1, reg
+ psrlq $32, %mm1
+ sub reg, cy
+ movd %mm1, reg
+ pmuludq %mm7, %mm0
+ dec n
+ mov cy, 4(rp)
+ lea 8(rp), rp
+ jnz L(top)
+
+L(end): sbb reg, cy
+
+L(eq1): movd %mm0, reg
+ psrlq $32, %mm0
+ mov SAVE_UP, up
+ sub reg, cy
+ movd %mm0, reg
+ emms
+ mov cy, (rp)
+ sbb reg, cy
+
+ mov SAVE_RP, rp
+ ret
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm
new file mode 100644
index 0000000..d5008f4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/bdiv_q_1.asm
@@ -0,0 +1,234 @@
+dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato.
+
+dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 19.0 cycles/limb
+
+C Pairs of movd's are used to avoid unaligned loads. Despite the loads not
+C being on the dependent chain and there being plenty of cycles available,
+C using an unaligned movq on every second iteration measured about 23 c/l.
+C
+
+defframe(PARAM_SHIFT, 24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse, int shift)
+ ALIGN(32)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+
+ movl PARAM_SRC, %eax
+
+ movl PARAM_DIVISOR, %ecx
+
+ movd %ecx, %mm6
+ movl PARAM_SHIFT, %ecx
+
+ movd %ecx, %mm7 C shift
+
+ C
+
+ movl PARAM_INVERSE, %ecx
+ movd %ecx, %mm5 C inv
+
+ movl PARAM_DST, %ecx
+ pxor %mm1, %mm1 C initial carry limb
+ pxor %mm0, %mm0 C initial carry bit
+
+ subl $1, %edx
+ jz L(done)
+
+ pcmpeqd %mm4, %mm4
+ psrlq $32, %mm4 C 0x00000000FFFFFFFF
+
+C The dependent chain here is as follows.
+C
+C latency
+C psubq s = (src-cbit) - climb 2
+C pmuludq q = s*inverse 8
+C pmuludq prod = q*divisor 8
+C psrlq climb = high(prod) 2
+C --
+C 20
+C
+C Yet the loop measures 19.0 c/l, so obviously there's something gained
+C there over a straight reading of the chip documentation.
+
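+C A rough C model of this loop (an illustrative sketch only: the names and
+C the use of uint32_t/uint64_t are the sketch's own, dinv is the inverse
+C d^-1 mod 2^32 kept in %mm5, and the right shift that handles even
+C divisors is left out):
+C
+C   #include <stdint.h>
+C   void bdiv_q_1_odd (uint32_t *dst, const uint32_t *src, long n,
+C                      uint32_t d, uint32_t dinv)
+C   {
+C     uint32_t cbit = 0, climb = 0;                     /* %mm0, %mm1 */
+C     for (long i = 0; i < n; i++)
+C       {
+C         uint64_t s = (uint64_t) src[i] - cbit - climb;   /* psubq x2 */
+C         cbit  = (uint32_t) (s >> 63);                    /* borrow out */
+C         uint32_t q = (uint32_t) s * dinv;                /* pmuludq (low) */
+C         dst[i] = q;
+C         climb = (uint32_t) (((uint64_t) q * d) >> 32);   /* new climb */
+C       }
+C   }
+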
+L(top):
+ C eax src, incrementing
+ C ebx
+ C ecx dst, incrementing
+ C edx counter, size-1 iterations
+ C
+ C mm0 carry bit
+ C mm1 carry limb
+ C mm4 0x00000000FFFFFFFF
+ C mm5 inverse
+ C mm6 divisor
+ C mm7 shift
+
+ movd (%eax), %mm2
+ movd 4(%eax), %mm3
+ addl $4, %eax
+ punpckldq %mm3, %mm2
+
+ psrlq %mm7, %mm2
+ pand %mm4, %mm2 C src
+ psubq %mm0, %mm2 C src - cbit
+
+ psubq %mm1, %mm2 C src - cbit - climb
+ movq %mm2, %mm0
+ psrlq $63, %mm0 C new cbit
+
+ pmuludq %mm5, %mm2 C s*inverse
+ movd %mm2, (%ecx) C q
+ addl $4, %ecx
+
+ movq %mm6, %mm1
+ pmuludq %mm2, %mm1 C q*divisor
+ psrlq $32, %mm1 C new climb
+
+L(entry):
+ subl $1, %edx
+ jnz L(top)
+
+L(done):
+ movd (%eax), %mm2
+ psrlq %mm7, %mm2 C src
+ psubq %mm0, %mm2 C src - cbit
+
+ psubq %mm1, %mm2 C src - cbit - climb
+
+ pmuludq %mm5, %mm2 C s*inverse
+ movd %mm2, (%ecx) C q
+
+ emms
+ ret
+
+EPILOGUE()
+
+ ALIGN(16)
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+
+ movl PARAM_DIVISOR, %ecx
+
+ C eax src
+ C ebx
+ C ecx divisor
+ C edx size-1
+
+ movl %ecx, %eax
+ bsfl %ecx, %ecx C trailing twos
+
+ shrl %cl, %eax C d = divisor without twos
+ movd %eax, %mm6
+ movd %ecx, %mm7 C shift
+
+ shrl %eax C d/2
+
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %ecx)
+ movzbl (%eax,%ecx), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ C
+
+ movd %eax, %mm5 C inv
+
+ movd %eax, %mm0 C inv
+
+ pmuludq %mm5, %mm5 C inv*inv
+
+ C
+
+ pmuludq %mm6, %mm5 C inv*inv*d
+ paddd %mm0, %mm0 C 2*inv
+
+ C
+
+ psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d
+ pxor %mm5, %mm5
+
+ paddd %mm0, %mm5
+ pmuludq %mm0, %mm0 C inv*inv
+
+ pcmpeqd %mm4, %mm4
+ psrlq $32, %mm4 C 0x00000000FFFFFFFF
+
+ C
+
+ pmuludq %mm6, %mm0 C inv*inv*d
+ paddd %mm5, %mm5 C 2*inv
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %ecx
+ pxor %mm1, %mm1 C initial carry limb
+
+ C
+
+ psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ movq %mm6, %mm0
+ pmuludq %mm5, %mm0
+ movd %mm0, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ pxor %mm0, %mm0 C initial carry bit
+ jmp L(entry)
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm
new file mode 100644
index 0000000..b3f3474
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_add_n.asm
@@ -0,0 +1,95 @@
+dnl Intel Pentium-4 mpn_cnd_add_n -- mpn addition.
+
+dnl Copyright 2001, 2002, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 4.67
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 5
+C P4 model 3-4 (Prescott) 5.25
+
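+C PARAM_CND is first turned into an all-zeros or all-ones mask (neg; sbb),
+C and every src2 limb is ANDed with that mask, so the addition is performed
+C unconditionally and the work done does not depend on the condition.  A
+C rough C model (illustrative sketch only):
+C
+C   #include <stdint.h>
+C   uint32_t cnd_add_n (uint32_t cnd, uint32_t *dst, const uint32_t *s1,
+C                       const uint32_t *s2, long n)
+C   {
+C     uint32_t mask = -(uint32_t) (cnd != 0);           /* neg; sbb */
+C     uint64_t cy = 0;                                  /* %mm0 */
+C     for (long i = 0; i < n; i++)
+C       {
+C         uint64_t t = (uint64_t) s1[i] + (s2[i] & mask) + cy;  /* pand, paddq */
+C         dst[i] = (uint32_t) t;
+C         cy = t >> 32;                                 /* psrlq $32 */
+C       }
+C     return (uint32_t) cy;
+C   }
+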
+defframe(PARAM_SIZE, 20)
+defframe(PARAM_SRC2, 16)
+defframe(PARAM_SRC1, 12)
+defframe(PARAM_DST, 8)
+defframe(PARAM_CND, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+define(`cnd', `%mm3')
+
+ TEXT
+ ALIGN(8)
+
+ ALIGN(8)
+PROLOGUE(mpn_cnd_add_n)
+deflit(`FRAME',0)
+ pxor %mm0, %mm0
+
+ mov PARAM_CND, %eax
+ neg %eax
+ sbb %eax, %eax
+ movd %eax, cnd
+
+ mov PARAM_SRC1, %eax
+ mov %ebx, SAVE_EBX
+ mov PARAM_SRC2, %ebx
+ mov PARAM_DST, %edx
+ mov PARAM_SIZE, %ecx
+
+ lea (%eax,%ecx,4), %eax C src1 end
+ lea (%ebx,%ecx,4), %ebx C src2 end
+ lea (%edx,%ecx,4), %edx C dst end
+ neg %ecx C -size
+
+L(top): movd (%ebx,%ecx,4), %mm2
+ movd (%eax,%ecx,4), %mm1
+ pand cnd, %mm2
+ paddq %mm2, %mm1
+
+ paddq %mm1, %mm0
+ movd %mm0, (%edx,%ecx,4)
+
+ psrlq $32, %mm0
+
+ add $1, %ecx
+ jnz L(top)
+
+ movd %mm0, %eax
+ mov SAVE_EBX, %ebx
+ emms
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm
new file mode 100644
index 0000000..339a23e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/cnd_sub_n.asm
@@ -0,0 +1,114 @@
+dnl Intel Pentium-4 mpn_cnd_sub_n -- mpn subtraction.
+
+dnl Copyright 2001, 2002, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 4.67
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 5
+C P4 model 3-4 (Prescott) 5.25
+
+defframe(PARAM_SIZE, 20)
+defframe(PARAM_SRC2, 16)
+defframe(PARAM_SRC1, 12)
+defframe(PARAM_DST, 8)
+defframe(PARAM_CND, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+define(`cnd', `%mm3')
+
+ TEXT
+ ALIGN(8)
+
+ ALIGN(8)
+PROLOGUE(mpn_cnd_sub_n)
+deflit(`FRAME',0)
+ pxor %mm0, %mm0
+
+ mov PARAM_CND, %eax
+ neg %eax
+ sbb %eax, %eax
+ movd %eax, cnd
+
+ mov PARAM_SRC1, %eax
+ mov %ebx, SAVE_EBX
+ mov PARAM_SRC2, %ebx
+ mov PARAM_DST, %edx
+ mov PARAM_SIZE, %ecx
+
+ lea (%eax,%ecx,4), %eax C src1 end
+ lea (%ebx,%ecx,4), %ebx C src2 end
+ lea (%edx,%ecx,4), %edx C dst end
+ neg %ecx C -size
+
+L(top): movd (%ebx,%ecx,4), %mm2
+ movd (%eax,%ecx,4), %mm1
+ pand cnd, %mm2
+ psubq %mm2, %mm1
+
+ psubq %mm0, %mm1
+ movd %mm1, (%edx,%ecx,4)
+
+ psrlq $63, %mm1
+
+ add $1, %ecx
+ jz L(done_mm1)
+
+ movd (%ebx,%ecx,4), %mm2
+ movd (%eax,%ecx,4), %mm0
+ pand cnd, %mm2
+ psubq %mm2, %mm0
+
+ psubq %mm1, %mm0
+ movd %mm0, (%edx,%ecx,4)
+
+ psrlq $63, %mm0
+
+ add $1, %ecx
+ jnz L(top)
+
+ movd %mm0, %eax
+ mov SAVE_EBX, %ebx
+ emms
+ ret
+
+L(done_mm1):
+ movd %mm1, %eax
+ mov SAVE_EBX, %ebx
+ emms
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm
new file mode 100644
index 0000000..0ceef5b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/dive_1.asm
@@ -0,0 +1,216 @@
+dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 19.0 cycles/limb
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+C Pairs of movd's are used to avoid unaligned loads. Despite the loads not
+C being on the dependent chain and there being plenty of cycles available,
+C using an unaligned movq on every second iteration measured about 23 c/l.
+C
+C Using divl for size==1 seems a touch quicker than mul-by-inverse. The mul
+C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of
+C that might be hidden by out-of-order execution, whereas divl is around 60.
+C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul
+C faster.
+
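+C The mul-by-inverse path below first computes dinv = d^-1 mod 2^32 for the
+C odd part of the divisor: an 8-bit starting inverse comes from
+C binvert_limb_table, then each Newton step inv = 2*inv - inv*inv*d doubles
+C the number of correct low bits (8 -> 16 -> 32).  A C sketch of that setup
+C (illustrative only; the table declaration and helper name are the
+C sketch's own assumptions):
+C
+C   #include <stdint.h>
+C   extern const unsigned char binvert_limb_table[128];   /* GMP internal */
+C   uint32_t invert_odd_limb (uint32_t d)                 /* d must be odd */
+C   {
+C     uint32_t inv = binvert_limb_table[(d >> 1) & 0x7F]; /* 8 bits good */
+C     inv = 2 * inv - inv * inv * d;                      /* 16 bits good */
+C     inv = 2 * inv - inv * inv * d;                      /* 32 bits good */
+C     return inv;                           /* d * inv == 1 (mod 2^32) */
+C   }
+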
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %edx
+
+ movl PARAM_SRC, %eax
+
+ movl PARAM_DIVISOR, %ecx
+ subl $1, %edx
+ jnz L(two_or_more)
+
+ movl (%eax), %eax
+ xorl %edx, %edx
+
+ divl %ecx
+ movl PARAM_DST, %ecx
+
+ movl %eax, (%ecx)
+ ret
+
+
+L(two_or_more):
+ C eax src
+ C ebx
+ C ecx divisor
+ C edx size-1
+
+ movl %ecx, %eax
+ bsfl %ecx, %ecx C trailing twos
+
+ shrl %cl, %eax C d = divisor without twos
+ movd %eax, %mm6
+ movd %ecx, %mm7 C shift
+
+ shrl %eax C d/2
+
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %ecx)
+ movzbl (%eax,%ecx), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ C
+
+ movd %eax, %mm5 C inv
+
+ movd %eax, %mm0 C inv
+
+ pmuludq %mm5, %mm5 C inv*inv
+
+ C
+
+ pmuludq %mm6, %mm5 C inv*inv*d
+ paddd %mm0, %mm0 C 2*inv
+
+ C
+
+ psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d
+ pxor %mm5, %mm5
+
+ paddd %mm0, %mm5
+ pmuludq %mm0, %mm0 C inv*inv
+
+ pcmpeqd %mm4, %mm4
+ psrlq $32, %mm4 C 0x00000000FFFFFFFF
+
+ C
+
+ pmuludq %mm6, %mm0 C inv*inv*d
+ paddd %mm5, %mm5 C 2*inv
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %ecx
+ pxor %mm1, %mm1 C initial carry limb
+
+ C
+
+ psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ movq %mm6, %mm0
+ pmuludq %mm5, %mm0
+ movd %mm0, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ pxor %mm0, %mm0 C initial carry bit
+
+
+C The dependent chain here is as follows.
+C
+C latency
+C psubq s = (src-cbit) - climb 2
+C pmuludq q = s*inverse 8
+C pmuludq prod = q*divisor 8
+C psrlq climb = high(prod) 2
+C --
+C 20
+C
+C Yet the loop measures 19.0 c/l, so obviously there's something gained
+C there over a straight reading of the chip documentation.
+
+L(top):
+ C eax src, incrementing
+ C ebx
+ C ecx dst, incrementing
+ C edx counter, size-1 iterations
+ C
+ C mm0 carry bit
+ C mm1 carry limb
+ C mm4 0x00000000FFFFFFFF
+ C mm5 inverse
+ C mm6 divisor
+ C mm7 shift
+
+ movd (%eax), %mm2
+ movd 4(%eax), %mm3
+ addl $4, %eax
+ punpckldq %mm3, %mm2
+
+ psrlq %mm7, %mm2
+ pand %mm4, %mm2 C src
+ psubq %mm0, %mm2 C src - cbit
+
+ psubq %mm1, %mm2 C src - cbit - climb
+ movq %mm2, %mm0
+ psrlq $63, %mm0 C new cbit
+
+ pmuludq %mm5, %mm2 C s*inverse
+ movd %mm2, (%ecx) C q
+ addl $4, %ecx
+
+ movq %mm6, %mm1
+ pmuludq %mm2, %mm1 C q*divisor
+ psrlq $32, %mm1 C new climb
+
+ subl $1, %edx
+ jnz L(top)
+
+
+L(done):
+ movd (%eax), %mm2
+ psrlq %mm7, %mm2 C src
+ psubq %mm0, %mm2 C src - cbit
+
+ psubq %mm1, %mm2 C src - cbit - climb
+
+ pmuludq %mm5, %mm2 C s*inverse
+ movd %mm2, (%ecx) C q
+
+ emms
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm
new file mode 100644
index 0000000..0146fab
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm
@@ -0,0 +1,645 @@
+dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
+
+dnl Copyright 1999-2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t inverse,
+C unsigned shift);
+C
+C Algorithm:
+C
+C The method and nomenclature follow part 8 of "Division by Invariant
+C Integers using Multiplication" by Granlund and Montgomery, reference in
+C gmp.texi.
+C
+C "m" is written for what is m' in the paper, and "d" for d_norm, which
+C won't cause any confusion since it's only the normalized divisor that's of
+C any use in the code. "b" is written for 2^N, the size of a limb, N being
+C 32 here.
+C
+C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
+C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets
+C us have just a psubq on the dependent chain.
+C
+C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
+C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
+C
+C Notes:
+C
+C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
+C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
+C carry, since in normal circumstances that will be a very rare event.
+C
+C The test for skipping a division is branch free (once size>=1 is tested).
+C The store to the destination high limb is 0 when a divide is skipped, or
+C if it's not skipped then a copy of the src high limb is stored. The
+C latter is in case src==dst.
+C
+C There's a small bias towards expecting xsize==0, by having code for
+C xsize==0 in a straight line and xsize!=0 under forward jumps.
+C
+C Enhancements:
+C
+C The loop measures 32 cycles, but the dependent chain would suggest it
+C could be done with 30. Not sure where to start looking for the extras.
+C
+C Alternatives:
+C
+C If the divisor is normalized (high bit set) then a division step can
+C always be skipped, since the high destination limb is always 0 or 1 in
+C that case. It doesn't seem worth checking for this though, since it
+C probably occurs infrequently.
+
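+C In C the per-limb division step used in the loops below looks roughly
+C like this (an illustrative sketch of the method, with B = 2^32, d
+C normalized so its top bit is set, m = floor((B*(B-d)-1)/d), and n2 < d;
+C the helper name and the uint32_t/uint64_t types are the sketch's own).
+C The subtraction is done as (n - d) - q1*d, matching the rearrangement
+C described above.
+C
+C   #include <stdint.h>
+C   static void div_step (uint32_t n2, uint32_t n10, uint32_t d, uint32_t m,
+C                         uint32_t *q, uint32_t *r)
+C   {
+C     uint32_t n1   = n10 >> 31;                    /* top bit of n10 */
+C     uint32_t nadj = n10 + (n1 ? d : 0);           /* mod 2^32, like paddd */
+C     uint32_t q1   = (uint32_t) (((((uint64_t) n2 << 32) + nadj)
+C                                  + (uint64_t) m * (n2 + n1)) >> 32);
+C     /* n - d - q1*d == n - (q1+1)*d, known to lie in [-d, d) */
+C     uint64_t t = (((uint64_t) n2 << 32) | n10) - d - (uint64_t) q1 * d;
+C     if (t >> 63)                    /* negative: q1+1 was one too many */
+C       { *q = q1;      *r = (uint32_t) t + d; }
+C     else
+C       { *q = q1 + 1;  *r = (uint32_t) t; }
+C   }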
+
+dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
+dnl
+dnl The inverse takes about 80-90 cycles to calculate, but after that the
+dnl multiply is 32 c/l versus division at about 58 c/l.
+dnl
+dnl At 4 limbs the div is a touch faster than the mul (and of course
+dnl simpler), so start the mul from 5 limbs.
+
+deflit(MUL_THRESHOLD, 5)
+
+
+defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
+defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
+defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC, 12)
+defframe(PARAM_XSIZE, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_ESI,`PARAM_SIZE')
+define(SAVE_EBP,`PARAM_SRC')
+define(SAVE_EDI,`PARAM_DIVISOR')
+define(SAVE_EBX,`PARAM_DST')
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_preinv_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ xorl %edx, %edx C carry if can't skip a div
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ movd PARAM_PREINV_INVERSE, %mm4
+
+ movd PARAM_PREINV_SHIFT, %mm7 C l
+ cmpl %ebp, %eax C high cmp divisor
+
+ cmovc( %eax, %edx) C high is carry if high<divisor
+ movd %edx, %mm0 C carry
+
+ movd %edx, %mm1 C carry
+ movl $0, %edx
+
+ movd %ebp, %mm5 C d
+ cmovnc( %eax, %edx) C 0 if skip div, src high if not
+ C (the latter in case src==dst)
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+
+ movl %edx, (%edi,%ecx,4) C dst high limb
+ sbbl $0, %ecx C skip one division if high<divisor
+ movl $32, %eax
+
+ subl PARAM_PREINV_SHIFT, %eax
+ psllq %mm7, %mm5 C d normalized
+ leal (%edi,%ecx,4), %edi C &dst[xsize+size-1]
+ leal -4(%esi,%ecx,4), %esi C &src[size-1]
+
+ movd %eax, %mm6 C 32-l
+ jmp L(start_preinv)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+
+ movl PARAM_CARRY, %edx
+
+ movl PARAM_SIZE, %ecx
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ xorl %edx, %edx C initial carry (if can't skip a div)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_DIVISOR, %ebp
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_XSIZE, %ebx
+ leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
+
+ orl %ecx, %ecx C size
+ jz L(no_skip_div) C if size==0
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+ cmpl %ebp, %eax C high cmp divisor
+
+ cmovnc( %eax, %edx) C 0 if skip div, src high if not
+ movl %edx, (%edi,%ecx,4) C dst high limb
+
+ movl $0, %edx
+ cmovc( %eax, %edx) C high is carry if high<divisor
+
+ sbbl $0, %ecx C size-1 if high<divisor
+L(no_skip_div):
+
+
+L(start_1c):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi src
+ C edi &dst[xsize-1]
+ C ebp divisor
+
+ leal (%ebx,%ecx), %eax C size+xsize
+ leal -4(%esi,%ecx,4), %esi C &src[size-1]
+ leal (%edi,%ecx,4), %edi C &dst[size+xsize-1]
+
+ cmpl $MUL_THRESHOLD, %eax
+ jae L(mul_by_inverse)
+
+
+ orl %ecx, %ecx
+ jz L(divide_no_integer) C if size==0
+
+L(divide_integer):
+ C eax scratch (quotient)
+ C ebx xsize
+ C ecx counter
+ C edx carry
+ C esi src, decrementing
+ C edi dst, decrementing
+ C ebp divisor
+
+ movl (%esi), %eax
+ subl $4, %esi
+
+ divl %ebp
+
+ movl %eax, (%edi)
+ subl $4, %edi
+
+ subl $1, %ecx
+ jnz L(divide_integer)
+
+
+L(divide_no_integer):
+ orl %ebx, %ebx
+ jnz L(divide_fraction) C if xsize!=0
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl SAVE_EDI, %edi
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+ movl %edx, %eax
+ ret
+
+
+L(divide_fraction):
+ C eax scratch (quotient)
+ C ebx counter
+ C ecx
+ C edx carry
+ C esi
+ C edi dst, decrementing
+ C ebp divisor
+
+ movl $0, %eax
+
+ divl %ebp
+
+ movl %eax, (%edi)
+ subl $4, %edi
+
+ subl $1, %ebx
+ jnz L(divide_fraction)
+
+ jmp L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx xsize
+ C ecx size
+ C edx carry
+ C esi &src[size-1]
+ C edi &dst[size+xsize-1]
+ C ebp divisor
+
+ bsrl %ebp, %eax C 31-l
+ movd %edx, %mm0 C carry
+ movd %edx, %mm1 C carry
+ movl %ecx, %edx C size
+ movl $31, %ecx
+
+ C
+
+ xorl %eax, %ecx C l = leading zeros on d
+ addl $1, %eax
+
+ shll %cl, %ebp C d normalized
+ movd %ecx, %mm7 C l
+ movl %edx, %ecx C size
+
+ movd %eax, %mm6 C 32-l
+ movl $-1, %edx
+ movl $-1, %eax
+
+ C
+
+ subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1 / d)
+ movd %ebp, %mm5 C d
+
+ C
+
+ movd %eax, %mm4 C m
+
+
+L(start_preinv):
+ C eax inverse
+ C ebx xsize
+ C ecx size
+ C edx
+ C esi &src[size-1]
+ C edi &dst[size+xsize-1]
+ C ebp
+ C
+ C mm0 carry
+ C mm1 carry
+ C mm2
+ C mm4 m
+ C mm5 d
+ C mm6 32-l
+ C mm7 l
+
+ psllq %mm7, %mm0 C n2 = carry << l, for size==0
+
+ subl $1, %ecx
+ jb L(integer_none)
+
+ movd (%esi), %mm0 C src high limb
+ punpckldq %mm1, %mm0
+ psrlq %mm6, %mm0 C n2 = high (carry:srchigh << l)
+ jz L(integer_last)
+
+
+C The dependent chain here consists of
+C
+C 2 paddd n1+n2
+C 8 pmuludq m*(n1+n2)
+C 2 paddq n2:nadj + m*(n1+n2)
+C 2 psrlq q1
+C 8 pmuludq d*q1
+C 2 psubq (n-d)-q1*d
+C 2 psrlq high n-(q1+1)*d mask
+C 2 pand d masked
+C 2 paddd n2+d addback
+C --
+C 30
+C
+C But it seems to run at 32 cycles, so presumably there's something else
+C going on.
+
+ ALIGN(16)
+L(integer_top):
+ C eax
+ C ebx
+ C ecx counter, size-1 to 0
+ C edx
+ C esi src, decrementing
+ C edi dst, decrementing
+ C
+ C mm0 n2
+ C mm4 m
+ C mm5 d
+ C mm6 32-l
+ C mm7 l
+
+ ASSERT(b,`C n2<d
+ movd %mm0, %eax
+ movd %mm5, %edx
+ cmpl %edx, %eax')
+
+ movd -4(%esi), %mm1 C next src limbs
+ movd (%esi), %mm2
+ leal -4(%esi), %esi
+
+ punpckldq %mm2, %mm1
+ psrlq %mm6, %mm1 C n10
+
+ movq %mm1, %mm2 C n10
+ movq %mm1, %mm3 C n10
+ psrad $31, %mm1 C -n1
+ pand %mm5, %mm1 C -n1 & d
+ paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
+
+ psrld $31, %mm2 C n1
+ paddd %mm0, %mm2 C n2+n1
+ punpckldq %mm0, %mm1 C n2:nadj
+
+ pmuludq %mm4, %mm2 C m*(n2+n1)
+
+ C
+
+ paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
+ pxor %mm2, %mm2 C break dependency, saves 4 cycles
+ pcmpeqd %mm2, %mm2 C FF...FF
+ psrlq $63, %mm2 C 1
+
+ psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
+
+ paddd %mm1, %mm2 C q1+1
+ pmuludq %mm5, %mm1 C q1*d
+
+ punpckldq %mm0, %mm3 C n = n2:n10
+ pxor %mm0, %mm0
+
+ psubq %mm5, %mm3 C n - d
+
+ C
+
+ psubq %mm1, %mm3 C n - (q1+1)*d
+
+ por %mm3, %mm0 C copy remainder -> new n2
+ psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
+
+ ASSERT(be,`C 0 or -1
+ movd %mm3, %eax
+ addl $1, %eax
+ cmpl $1, %eax')
+
+ paddd %mm3, %mm2 C q
+ pand %mm5, %mm3 C mask & d
+
+ paddd %mm3, %mm0 C addback if necessary
+ movd %mm2, (%edi)
+ leal -4(%edi), %edi
+
+ subl $1, %ecx
+ ja L(integer_top)
+
+
+L(integer_last):
+ C eax
+ C ebx xsize
+ C ecx
+ C edx
+ C esi &src[0]
+ C edi &dst[xsize]
+ C
+ C mm0 n2
+ C mm4 m
+ C mm5 d
+ C mm6
+ C mm7 l
+
+ ASSERT(b,`C n2<d
+ movd %mm0, %eax
+ movd %mm5, %edx
+ cmpl %edx, %eax')
+
+ movd (%esi), %mm1 C src[0]
+ psllq %mm7, %mm1 C n10
+
+ movq %mm1, %mm2 C n10
+ movq %mm1, %mm3 C n10
+ psrad $31, %mm1 C -n1
+ pand %mm5, %mm1 C -n1 & d
+ paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
+
+ psrld $31, %mm2 C n1
+ paddd %mm0, %mm2 C n2+n1
+ punpckldq %mm0, %mm1 C n2:nadj
+
+ pmuludq %mm4, %mm2 C m*(n2+n1)
+
+ C
+
+ paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
+ pcmpeqd %mm2, %mm2 C FF...FF
+ psrlq $63, %mm2 C 1
+
+ psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
+ paddd %mm1, %mm2 C q1+1
+
+ pmuludq %mm5, %mm1 C q1*d
+ punpckldq %mm0, %mm3 C n
+ psubq %mm5, %mm3 C n - d
+ pxor %mm0, %mm0
+
+ C
+
+ psubq %mm1, %mm3 C n - (q1+1)*d
+
+ por %mm3, %mm0 C remainder -> n2
+ psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
+
+ ASSERT(be,`C 0 or -1
+ movd %mm3, %eax
+ addl $1, %eax
+ cmpl $1, %eax')
+
+ paddd %mm3, %mm2 C q
+ pand %mm5, %mm3 C mask & d
+
+ paddd %mm3, %mm0 C addback if necessary
+ movd %mm2, (%edi)
+ leal -4(%edi), %edi
+
+
+L(integer_none):
+ C eax
+ C ebx xsize
+
+ orl %ebx, %ebx
+ jnz L(fraction_some) C if xsize!=0
+
+
+L(fraction_done):
+ movl SAVE_EBP, %ebp
+ psrld %mm7, %mm0 C remainder
+
+ movl SAVE_EDI, %edi
+ movd %mm0, %eax
+
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+ emms
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+
+L(fraction_some):
+ C eax
+ C ebx xsize
+ C ecx
+ C edx
+ C esi
+ C edi &dst[xsize-1]
+ C ebp
+
+
+L(fraction_top):
+ C eax
+ C ebx counter, xsize iterations
+ C ecx
+ C edx
+ C esi src, decrementing
+ C edi dst, decrementing
+ C
+ C mm0 n2
+ C mm4 m
+ C mm5 d
+ C mm6 32-l
+ C mm7 l
+
+ ASSERT(b,`C n2<d
+ movd %mm0, %eax
+ movd %mm5, %edx
+ cmpl %edx, %eax')
+
+ movq %mm0, %mm1 C n2
+ pmuludq %mm4, %mm0 C m*n2
+
+ pcmpeqd %mm2, %mm2
+ psrlq $63, %mm2
+
+ C
+
+ psrlq $32, %mm0 C high(m*n2)
+
+ paddd %mm1, %mm0 C q1 = high(n2:0 + m*n2)
+
+ paddd %mm0, %mm2 C q1+1
+ pmuludq %mm5, %mm0 C q1*d
+
+ psllq $32, %mm1 C n = n2:0
+ psubq %mm5, %mm1 C n - d
+
+ C
+
+ psubq %mm0, %mm1 C r = n - (q1+1)*d
+ pxor %mm0, %mm0
+
+ por %mm1, %mm0 C r -> n2
+ psrlq $32, %mm1 C high n - (q1+1)*d, 0 or -1
+
+ ASSERT(be,`C 0 or -1
+ movd %mm1, %eax
+ addl $1, %eax
+ cmpl $1, %eax')
+
+ paddd %mm1, %mm2 C q
+ pand %mm5, %mm1 C mask & d
+
+ paddd %mm1, %mm0 C addback if necessary
+ movd %mm2, (%edi)
+ leal -4(%edi), %edi
+
+ subl $1, %ebx
+ jne L(fraction_top)
+
+
+ jmp L(fraction_done)
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h b/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h
new file mode 100644
index 0000000..a047a51
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/gmp-mparam.h
@@ -0,0 +1,213 @@
+/* Intel Pentium-4 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2600 MHz P4 Northwood */
+/* FFT tuning limit = 23,700,309 */
+/* Generated by tuneup.c, 2019-11-09, gcc 8.2 */
+
+#define MOD_1_NORM_THRESHOLD 5
+#define MOD_1_UNNORM_THRESHOLD 14
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 5
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 13
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 7
+#define USE_PREINV_DIVREM_1 1 /* native */
+#define DIV_QR_1N_PI1_METHOD 2 /* 4.36% faster than 1 */
+#define DIV_QR_1_NORM_THRESHOLD 16
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 21
+
+#define DIV_1_VS_MUL_1_PERCENT 358
+
+#define MUL_TOOM22_THRESHOLD 26
+#define MUL_TOOM33_THRESHOLD 101
+#define MUL_TOOM44_THRESHOLD 284
+#define MUL_TOOM6H_THRESHOLD 406
+#define MUL_TOOM8H_THRESHOLD 592
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 101
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 191
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 189
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 195
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 151
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 51
+#define SQR_TOOM3_THRESHOLD 163
+#define SQR_TOOM4_THRESHOLD 254
+#define SQR_TOOM6_THRESHOLD 614
+#define SQR_TOOM8_THRESHOLD 842
+
+#define MULMID_TOOM42_THRESHOLD 58
+
+#define MULMOD_BNM1_THRESHOLD 19
+#define SQRMOD_BNM1_THRESHOLD 23
+
+#define MUL_FFT_MODF_THRESHOLD 824 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 824, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \
+ { 17, 5}, { 36, 6}, { 19, 5}, { 39, 6}, \
+ { 29, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \
+ { 36, 7}, { 19, 6}, { 39, 7}, { 21, 6}, \
+ { 43, 7}, { 23, 6}, { 48, 7}, { 29, 8}, \
+ { 15, 7}, { 37, 8}, { 19, 7}, { 43, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 7}, { 55, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 99, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \
+ { 31,10}, { 63, 9}, { 143,10}, { 79, 9}, \
+ { 167,10}, { 95, 9}, { 191,10}, { 111,11}, \
+ { 63,10}, { 127, 9}, { 255,10}, { 159,11}, \
+ { 95,10}, { 191,12}, { 63,11}, { 127,10}, \
+ { 271,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 335,11}, { 191,10}, { 383, 9}, { 799,10}, \
+ { 415,11}, { 223,12}, { 127,11}, { 255,10}, \
+ { 527,11}, { 287,10}, { 607, 9}, { 1215,11}, \
+ { 319,10}, { 671,12}, { 191,11}, { 383,10}, \
+ { 799,11}, { 415,10}, { 863,13}, { 127,12}, \
+ { 255,11}, { 543,10}, { 1119, 9}, { 2239,11}, \
+ { 607,10}, { 1215,12}, { 319,11}, { 671,10}, \
+ { 1343,11}, { 735,10}, { 1471, 9}, { 2943,12}, \
+ { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \
+ { 447,11}, { 927,10}, { 1855,11}, { 959,13}, \
+ { 255,12}, { 511,11}, { 1119,12}, { 575,11}, \
+ { 1215,10}, { 2431,11}, { 1247,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1471,10}, { 2943,13}, \
+ { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \
+ { 1727,10}, { 3455,12}, { 895,14}, { 255,13}, \
+ { 511,12}, { 1087,11}, { 2239,10}, { 4479,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1343,11}, \
+ { 2687,12}, { 1471,11}, { 2943,13}, { 767,12}, \
+ { 1727,11}, { 3455,13}, { 895,12}, { 1983,14}, \
+ { 511,13}, { 1023,12}, { 2239,13}, { 1151,12}, \
+ { 2495,11}, { 4991,13}, { 1407,12}, { 2943,14}, \
+ { 767,13}, { 1535,12}, { 3135,13}, { 1663,12}, \
+ { 3455,13}, { 1919,12}, { 3967,15}, { 511,14}, \
+ { 1023,13}, { 2175,12}, { 4479,13}, { 2431,12}, \
+ { 4991,14}, { 1279,13}, { 2687,12}, { 5503,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 167
+#define MUL_FFT_THRESHOLD 7808
+
+#define SQR_FFT_MODF_THRESHOLD 560 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 560, 5}, { 33, 6}, { 17, 5}, { 35, 6}, \
+ { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \
+ { 39, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \
+ { 23, 7}, { 47, 8}, { 27, 7}, { 55, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \
+ { 79, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 95,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 159,10}, { 111,11}, { 63,10}, { 143, 9}, \
+ { 287,10}, { 159,11}, { 95,10}, { 191,12}, \
+ { 63,11}, { 127, 9}, { 511, 8}, { 1023, 9}, \
+ { 527,11}, { 159,10}, { 319, 9}, { 639,10}, \
+ { 351,11}, { 191,10}, { 431,11}, { 223,12}, \
+ { 127,11}, { 255,10}, { 543,11}, { 287,10}, \
+ { 607, 9}, { 1215,11}, { 319,10}, { 639,11}, \
+ { 351,12}, { 191,11}, { 383,10}, { 767,11}, \
+ { 415,10}, { 831,13}, { 127,12}, { 255,11}, \
+ { 543,10}, { 1119,11}, { 607,12}, { 319,11}, \
+ { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \
+ { 799,10}, { 1599,11}, { 863,12}, { 447,11}, \
+ { 927,10}, { 1855,11}, { 991,13}, { 255,12}, \
+ { 511,11}, { 1055,10}, { 2111,11}, { 1087,12}, \
+ { 575,11}, { 1215,10}, { 2431,12}, { 639,11}, \
+ { 1343,12}, { 703,11}, { 1407,13}, { 383,12}, \
+ { 767,11}, { 1599,12}, { 831,11}, { 1727,10}, \
+ { 3455,12}, { 895,11}, { 1855,12}, { 959,14}, \
+ { 255,13}, { 511,12}, { 1087,11}, { 2239,12}, \
+ { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \
+ { 2943,13}, { 767,12}, { 1727,11}, { 3455,13}, \
+ { 895,12}, { 1983,14}, { 511,13}, { 1023,12}, \
+ { 2239,13}, { 1151,12}, { 2495,11}, { 4991,13}, \
+ { 1279,12}, { 2623,13}, { 1407,12}, { 2943,14}, \
+ { 767,13}, { 1663,12}, { 3455,13}, { 1919,12}, \
+ { 3839,15}, { 511,14}, { 1023,13}, { 2175,12}, \
+ { 4479,13}, { 2431,12}, { 4991,14}, { 1279,13}, \
+ { 2687,12}, { 5503,13}, { 8192,14}, { 16384,15}, \
+ { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 149
+#define SQR_FFT_THRESHOLD 4800
+
+#define MULLO_BASECASE_THRESHOLD 12
+#define MULLO_DC_THRESHOLD 44
+#define MULLO_MUL_N_THRESHOLD 14281
+#define SQRLO_BASECASE_THRESHOLD 13
+#define SQRLO_DC_THRESHOLD 42
+#define SQRLO_SQR_THRESHOLD 9449
+
+#define DC_DIV_QR_THRESHOLD 38
+#define DC_DIVAPPR_Q_THRESHOLD 105
+#define DC_BDIV_QR_THRESHOLD 52
+#define DC_BDIV_Q_THRESHOLD 83
+
+#define INV_MULMOD_BNM1_THRESHOLD 50
+#define INV_NEWTON_THRESHOLD 158
+#define INV_APPR_THRESHOLD 118
+
+#define BINV_NEWTON_THRESHOLD 342
+#define REDC_1_TO_REDC_N_THRESHOLD 67
+
+#define MU_DIV_QR_THRESHOLD 2130
+#define MU_DIVAPPR_Q_THRESHOLD 1895
+#define MUPI_DIV_QR_THRESHOLD 60
+#define MU_BDIV_QR_THRESHOLD 1652
+#define MU_BDIV_Q_THRESHOLD 2089
+
+#define POWM_SEC_TABLE 1,22,96,446,723,1378
+
+#define GET_STR_DC_THRESHOLD 13
+#define GET_STR_PRECOMPUTE_THRESHOLD 20
+#define SET_STR_DC_THRESHOLD 298
+#define SET_STR_PRECOMPUTE_THRESHOLD 960
+
+#define FAC_DSC_THRESHOLD 212
+#define FAC_ODD_THRESHOLD 71
+
+#define MATRIX22_STRASSEN_THRESHOLD 26
+#define HGCD2_DIV1_METHOD 3 /* 0.68% faster than 1 */
+#define HGCD_THRESHOLD 80
+#define HGCD_APPR_THRESHOLD 138
+#define HGCD_REDUCE_THRESHOLD 4455
+#define GCD_DC_THRESHOLD 365
+#define GCDEXT_DC_THRESHOLD 245
+#define JACOBI_BASE_METHOD 4 /* 23.41% faster than 1 */
+
+/* Tuneup completed successfully, took 63807 seconds */
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm
new file mode 100644
index 0000000..ee88bab
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm
@@ -0,0 +1,166 @@
+dnl x86-32 mpn_mod_1_1p for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C * Optimize. The present code was written quite straightforwardly.
+C * Optimize post-loop reduction code; it is from mod_1s_4p, thus overkill.
+C * Write a cps function that uses sse2 insns.
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 16
+C P4 model 3-4 (Prescott) 18
+
+C INPUT PARAMETERS
+C ap sp + 4
+C n sp + 8
+C b sp + 12
+C cps sp + 16
+
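+C The main loop keeps a 64-bit residue r = rh*2^32 + rl (in %mm7) that is
+C only congruent to the true remainder, folding in one limb per iteration
+C from the most significant end down:
+C
+C   r' = a[i] + rl*B1modb + rh*B2modb
+C      == a[i] + rl*2^32 + rh*2^64  ==  a[i] + r*2^32   (mod b)
+C
+C since B1modb == 2^32 and B2modb == 2^64 (mod b); the companion cps
+C routine precomputes these together with the inverse bi and the shift
+C count.  Only the final value is reduced, using bi.
+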
+define(`B1modb', `%mm1')
+define(`B2modb', `%mm2')
+define(`ap', `%edx')
+define(`n', `%eax')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_1_1p)
+ push %ebx
+ mov 8(%esp), ap
+ mov 12(%esp), n
+ mov 20(%esp), %ecx
+ movd 8(%ecx), B1modb
+ movd 12(%ecx), B2modb
+
+ lea -4(ap,n,4), ap
+
+C FIXME: See comment in generic/mod_1_1.c.
+ movd (ap), %mm7
+ movd -4(ap), %mm4
+ pmuludq B1modb, %mm7
+ paddq %mm4, %mm7
+ add $-2, n
+ jz L(end)
+
+ ALIGN(8)
+L(top): movq %mm7, %mm6
+ psrlq $32, %mm7 C rh
+ movd -8(ap), %mm0
+ add $-4, ap
+ pmuludq B2modb, %mm7
+ pmuludq B1modb, %mm6
+ add $-1, n
+ paddq %mm0, %mm7
+ paddq %mm6, %mm7
+ jnz L(top)
+
+L(end): pcmpeqd %mm4, %mm4
+ psrlq $32, %mm4 C 0x00000000FFFFFFFF
+ pand %mm7, %mm4 C rl
+ psrlq $32, %mm7 C rh
+ pmuludq B1modb, %mm7 C rh,cl
+ paddq %mm4, %mm7 C rh,rl
+ movd 4(%ecx), %mm4 C cnt
+ psllq %mm4, %mm7 C rh,rl normalized
+ movq %mm7, %mm2 C rl in low half
+ psrlq $32, %mm7 C rh
+ movd (%ecx), %mm1 C bi
+ pmuludq %mm7, %mm1 C qh,ql
+ paddq %mm2, %mm1 C qh-1,ql
+ movd %mm1, %ecx C ql
+ psrlq $32, %mm1 C qh-1
+ movd 16(%esp), %mm3 C b
+ pmuludq %mm1, %mm3 C (qh-1) * b
+ psubq %mm3, %mm2 C r in low half (could use psubd)
+ movd %mm2, %eax C r
+ mov 16(%esp), %ebx
+ sub %ebx, %eax C r
+ cmp %eax, %ecx
+ lea (%eax,%ebx), %edx
+ cmovc( %edx, %eax)
+ movd %mm4, %ecx C cnt
+ cmp %ebx, %eax
+ jae L(fix)
+ emms
+ pop %ebx
+ shr %cl, %eax
+ ret
+
+L(fix): sub %ebx, %eax
+ emms
+ pop %ebx
+ shr %cl, %eax
+ ret
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps)
+C CAUTION: This is the same code as in k7/mod_1_1.asm
+ push %ebp
+ mov 12(%esp), %ebp
+ push %esi
+ bsr %ebp, %ecx
+ push %ebx
+ xor $31, %ecx
+ mov 16(%esp), %esi
+ sal %cl, %ebp
+ mov %ebp, %edx
+ not %edx
+ mov $-1, %eax
+ div %ebp
+ mov %eax, (%esi) C store bi
+ mov %ecx, 4(%esi) C store cnt
+ xor %ebx, %ebx
+ sub %ebp, %ebx
+ mov $1, %edx
+ shld %cl, %eax, %edx
+ imul %edx, %ebx
+ mul %ebx
+ add %ebx, %edx
+ not %edx
+ imul %ebp, %edx
+ add %edx, %ebp
+ cmp %edx, %eax
+ cmovc( %ebp, %edx)
+ shr %cl, %ebx
+ mov %ebx, 8(%esi) C store B1modb
+ shr %cl, %edx
+ mov %edx, 12(%esi) C store B2modb
+ pop %ebx
+ pop %esi
+ pop %ebp
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm
new file mode 100644
index 0000000..eb2edb6
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_4.asm
@@ -0,0 +1,269 @@
+dnl x86-32 mpn_mod_1s_4p for Pentium 4 and P6 models with SSE2 (i.e. 9,D,E,F).
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C * Optimize. The present code was written quite straightforwardly.
+C * Optimize post-loop reduction code.
+C * Write a cps function that uses sse2 insns.
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 3.4
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 4
+C P4 model 3-4 (Prescott) 4.5
+
+C INPUT PARAMETERS
+C ap sp + 4
+C n sp + 8
+C b sp + 12
+C cps sp + 16
+
+define(`B1modb', `%mm1')
+define(`B2modb', `%mm2')
+define(`B3modb', `%mm3')
+define(`B4modb', `%mm4')
+define(`B5modb', `%mm5')
+define(`ap', `%edx')
+define(`n', `%eax')
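+
+C For reference, each trip of the main loop below folds four more limbs
+C into the 64-bit residue r, as in this rough C sketch (not the GMP
+C reference code; each Bkmodb is congruent to 2^(32*k) mod b):
+C
+C     r = ap[i]
+C       + (uint64_t) ap[i+1] * B1modb
+C       + (uint64_t) ap[i+2] * B2modb
+C       + (uint64_t) ap[i+3] * B3modb
+C       + (uint64_t) (uint32_t) r * B4modb     /* low  half of old r */
+C       + (r >> 32) * B5modb;                  /* high half of old r */
+C
+C The feed-in code merely gets n down to a multiple of 4, and the code
+C from L(end) on reduces r to a single limb.  The bound analysis that
+C keeps these sums within 64 bits is in the generic mod_1s_4p code.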
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p)
+ push %ebx
+ mov 8(%esp), ap
+ mov 12(%esp), n
+ mov 20(%esp), %ecx
+
+ movd 8(%ecx), B1modb
+ movd 12(%ecx), B2modb
+ movd 16(%ecx), B3modb
+ movd 20(%ecx), B4modb
+ movd 24(%ecx), B5modb
+
+ mov n, %ebx
+ lea -4(ap,n,4), ap
+ and $3, %ebx
+ je L(b0)
+ cmp $2, %ebx
+ jc L(b1)
+ je L(b2)
+
+L(b3): movd -4(ap), %mm7
+ pmuludq B1modb, %mm7
+ movd -8(ap), %mm6
+ paddq %mm6, %mm7
+ movd (ap), %mm6
+ pmuludq B2modb, %mm6
+ paddq %mm6, %mm7
+ lea -24(ap), ap
+ add $-3, n
+ jz L(end)
+ jmp L(top)
+
+L(b0): movd -8(ap), %mm7
+ pmuludq B1modb, %mm7
+ movd -12(ap), %mm6
+ paddq %mm6, %mm7
+ movd -4(ap), %mm6
+ pmuludq B2modb, %mm6
+ paddq %mm6, %mm7
+ movd (ap), %mm6
+ pmuludq B3modb, %mm6
+ paddq %mm6, %mm7
+ lea -28(ap), ap
+ add $-4, n
+ jz L(end)
+ jmp L(top)
+
+L(b1): movd (ap), %mm7
+ lea -16(ap), ap
+ dec n
+ jz L(x)
+ jmp L(top)
+
+L(b2): movd -4(ap), %mm7 C rl
+ punpckldq (ap), %mm7 C rh
+ lea -20(ap), ap
+ add $-2, n
+ jz L(end)
+
+ ALIGN(8)
+L(top): movd 4(ap), %mm0
+ pmuludq B1modb, %mm0
+ movd 0(ap), %mm6
+ paddq %mm6, %mm0
+
+ movd 8(ap), %mm6
+ pmuludq B2modb, %mm6
+ paddq %mm6, %mm0
+
+ movd 12(ap), %mm6
+ pmuludq B3modb, %mm6
+ paddq %mm6, %mm0
+
+ movq %mm7, %mm6
+ psrlq $32, %mm7 C rh
+ pmuludq B5modb, %mm7
+ pmuludq B4modb, %mm6
+
+ paddq %mm0, %mm7
+ paddq %mm6, %mm7
+
+ add $-16, ap
+ add $-4, n
+ jnz L(top)
+
+L(end): pcmpeqd %mm4, %mm4
+ psrlq $32, %mm4 C 0x00000000FFFFFFFF
+ pand %mm7, %mm4 C rl
+ psrlq $32, %mm7 C rh
+ pmuludq B1modb, %mm7 C rh,cl
+ paddq %mm4, %mm7 C rh,rl
+L(x): movd 4(%ecx), %mm4 C cnt
+ psllq %mm4, %mm7 C rh,rl normalized
+ movq %mm7, %mm2 C rl in low half
+ psrlq $32, %mm7 C rh
+ movd (%ecx), %mm1 C bi
+ pmuludq %mm7, %mm1 C qh,ql
+ paddq %mm2, %mm1 C qh-1,ql
+ movd %mm1, %ecx C ql
+ psrlq $32, %mm1 C qh-1
+ movd 16(%esp), %mm3 C b
+ pmuludq %mm1, %mm3 C (qh-1) * b
+ psubq %mm3, %mm2 C r in low half (could use psubd)
+ movd %mm2, %eax C r
+ mov 16(%esp), %ebx
+ sub %ebx, %eax C r
+ cmp %eax, %ecx
+ lea (%eax,%ebx), %edx
+ cmovc( %edx, %eax)
+ movd %mm4, %ecx C cnt
+ cmp %ebx, %eax
+ jae L(fix)
+ emms
+ pop %ebx
+ shr %cl, %eax
+ ret
+
+L(fix): sub %ebx, %eax
+ emms
+ pop %ebx
+ shr %cl, %eax
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p_cps)
+C CAUTION: This is the same code as in k7/mod_1_4.asm
+ push %ebp
+ push %edi
+ push %esi
+ push %ebx
+ mov 20(%esp), %ebp C FIXME: avoid bp for 0-idx
+ mov 24(%esp), %ebx
+ bsr %ebx, %ecx
+ xor $31, %ecx
+ sal %cl, %ebx C b << cnt
+ mov %ebx, %edx
+ not %edx
+ mov $-1, %eax
+ div %ebx
+ xor %edi, %edi
+ sub %ebx, %edi
+ mov $1, %esi
+ mov %eax, (%ebp) C store bi
+ mov %ecx, 4(%ebp) C store cnt
+ shld %cl, %eax, %esi
+ imul %edi, %esi
+ mov %eax, %edi
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 8(%ebp) C store B1modb
+
+ not %edx
+ imul %ebx, %edx
+ lea (%edx,%ebx), %esi
+ cmp %edx, %eax
+ cmovnc( %edx, %esi)
+ mov %edi, %eax
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 12(%ebp) C store B2modb
+
+ not %edx
+ imul %ebx, %edx
+ lea (%edx,%ebx), %esi
+ cmp %edx, %eax
+ cmovnc( %edx, %esi)
+ mov %edi, %eax
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 16(%ebp) C store B3modb
+
+ not %edx
+ imul %ebx, %edx
+ lea (%edx,%ebx), %esi
+ cmp %edx, %eax
+ cmovnc( %edx, %esi)
+ mov %edi, %eax
+ mul %esi
+
+ add %esi, %edx
+ shr %cl, %esi
+ mov %esi, 20(%ebp) C store B4modb
+
+ not %edx
+ imul %ebx, %edx
+ add %edx, %ebx
+ cmp %edx, %eax
+ cmovnc( %edx, %ebx)
+
+ shr %cl, %ebx
+ mov %ebx, 24(%ebp) C store B5modb
+
+ pop %ebx
+ pop %esi
+ pop %edi
+ pop %ebp
+ ret
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm
new file mode 100644
index 0000000..31e25b7
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_34lsub1.asm
@@ -0,0 +1,175 @@
+dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
+
+dnl Copyright 2000-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C Pentium4: 1.0 cycles/limb
+
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C Enhancements:
+C
+C There might be a couple of cycles to save by using plain integer code for
+C more small sizes. 2 limbs measures about 20 cycles, but 3 limbs jumps to
+C about 46 (inclusive of some function call overheads).
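+C
+C For reference, the return value is only congruent to {src,size} modulo
+C 2^24-1, not fully reduced; since 2^32 == 2^8 (mod 2^24-1), limb i
+C carries weight 2^(8*(i mod 3)).  A rough C equivalent (a sketch, not
+C the GMP reference code):
+C
+C     uint32_t fold24 (uint64_t x)     /* preserve value mod 2^24-1 */
+C     {
+C       while (x >> 24)
+C         x = (x & 0xffffff) + (x >> 24);
+C       return (uint32_t) x;
+C     }
+C     uint32_t mod_34lsub1_ref (const uint32_t *src, size_t n)
+C     {
+C       uint64_t r = 0;
+C       for (size_t i = 0; i < n; i++)
+C         r = fold24 (r + ((uint64_t) src[i] << (8 * (i % 3))));
+C       return (uint32_t) r;
+C     }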
+
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX, `PARAM_SRC')
+define(SAVE_ESI, `PARAM_SIZE')
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mod_34lsub1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl PARAM_SRC, %edx
+ movl (%edx), %eax
+
+ subl $2, %ecx
+ ja L(three_or_more)
+ jne L(one)
+
+ movl 4(%edx), %edx
+ movl %eax, %ecx
+ shrl $24, %eax C src[0] high
+
+ andl $0x00FFFFFF, %ecx C src[0] low
+ addl %ecx, %eax
+
+ movl %edx, %ecx
+ shll $8, %edx
+
+ shrl $16, %ecx C src[1] low
+ addl %ecx, %eax
+
+ andl $0x00FFFF00, %edx C src[1] high
+ addl %edx, %eax
+
+L(one):
+ ret
+
+
+L(three_or_more):
+ pxor %mm0, %mm0
+ pxor %mm1, %mm1
+ pxor %mm2, %mm2
+
+ pcmpeqd %mm7, %mm7
+ psrlq $32, %mm7 C 0x00000000FFFFFFFF, low 32 bits
+
+ pcmpeqd %mm6, %mm6
+ psrlq $40, %mm6 C 0x0000000000FFFFFF, low 24 bits
+
+L(top):
+ C eax
+ C ebx
+ C ecx counter, size-2 to 0, -1 or -2
+ C edx src, incrementing
+ C
+ C mm0 sum 0mod3
+ C mm1 sum 1mod3
+ C mm2 sum 2mod3
+ C mm3
+ C mm4
+ C mm5
+ C mm6 0x0000000000FFFFFF
+ C mm7 0x00000000FFFFFFFF
+
+ movd (%edx), %mm3
+ paddq %mm3, %mm0
+
+ movd 4(%edx), %mm3
+ paddq %mm3, %mm1
+
+ movd 8(%edx), %mm3
+ paddq %mm3, %mm2
+
+ addl $12, %edx
+ subl $3, %ecx
+ ja L(top)
+
+
+ C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
+
+ addl $1, %ecx
+ js L(combine) C 0 more
+
+ movd (%edx), %mm3
+ paddq %mm3, %mm0
+
+ jz L(combine) C 1 more
+
+ movd 4(%edx), %mm3
+ paddq %mm3, %mm1
+
+L(combine):
+ movq %mm7, %mm3 C low halves
+ pand %mm0, %mm3
+
+ movq %mm7, %mm4
+ pand %mm1, %mm4
+
+ movq %mm7, %mm5
+ pand %mm2, %mm5
+
+ psrlq $32, %mm0 C high halves
+ psrlq $32, %mm1
+ psrlq $32, %mm2
+
+ paddq %mm0, %mm4 C fold high halves to give 33 bits each
+ paddq %mm1, %mm5
+ paddq %mm2, %mm3
+
+ psllq $8, %mm4 C combine at respective offsets
+ psllq $16, %mm5
+ paddq %mm4, %mm3
+ paddq %mm5, %mm3 C 0x000cxxxxxxxxxxxx, 50 bits
+
+ pand %mm3, %mm6 C fold at 24 bits
+ psrlq $24, %mm3
+
+ paddq %mm6, %mm3
+ movd %mm3, %eax
+
+ ASSERT(z, C nothing left in high dword
+ `psrlq $32, %mm3
+ movd %mm3, %ecx
+ orl %ecx, %ecx')
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm
new file mode 100644
index 0000000..aa9ef31
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/mode1o.asm
@@ -0,0 +1,175 @@
+dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
+
+dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 19.0 cycles/limb
+
+
+C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor, mp_limb_t carry);
+C
+
+defframe(PARAM_CARRY, 16)
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1c_odd)
+deflit(`FRAME',0)
+
+ movd PARAM_CARRY, %mm1
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_modexact_1_odd)
+deflit(`FRAME',0)
+
+ pxor %mm1, %mm1 C carry limb
+L(start_1c):
+ movl PARAM_DIVISOR, %eax
+
+ movd PARAM_DIVISOR, %mm7
+
+ shrl %eax
+
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edx)
+ movzbl (%eax,%edx), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ C
+
+ movd %eax, %mm6 C inv
+
+ movd %eax, %mm0 C inv
+
+ pmuludq %mm6, %mm6 C inv*inv
+
+ C
+
+ pmuludq %mm7, %mm6 C inv*inv*d
+ paddd %mm0, %mm0 C 2*inv
+
+ C
+
+ psubd %mm6, %mm0 C inv = 2*inv - inv*inv*d
+ pxor %mm6, %mm6
+
+ paddd %mm0, %mm6
+ pmuludq %mm0, %mm0 C inv*inv
+
+ C
+
+ pmuludq %mm7, %mm0 C inv*inv*d
+ paddd %mm6, %mm6 C 2*inv
+
+
+ movl PARAM_SRC, %eax
+ movl PARAM_SIZE, %ecx
+
+ C
+
+ psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ movd %mm6, %eax
+ imul PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ pxor %mm0, %mm0 C carry bit
+
+
+C The dependent chain here is as follows.
+C
+C latency
+C psubq s = (src-cbit) - climb 2
+C pmuludq q = s*inverse 8
+C pmuludq prod = q*divisor 8
+C psrlq climb = high(prod) 2
+C --
+C 20
+C
+C Yet the loop measures 19.0 c/l, so obviously there's something gained
+C there over a straight reading of the chip documentation.
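+C
+C In C the loop body is roughly (a sketch from reading the code below,
+C not the GMP reference; d is the divisor, inv its inverse mod 2^32, and
+C cbit/climb start as 0 and the carry-in respectively):
+C
+C     uint64_t sub = (uint64_t) cbit + climb;
+C     uint32_t s   = (uint32_t) (src[i] - sub);          /* psubq, psubq */
+C     cbit  = (uint64_t) src[i] < sub;                   /* psrlq $63    */
+C     uint32_t q = s * inv;                              /* pmuludq      */
+C     climb = (uint32_t) (((uint64_t) q * d) >> 32);     /* pmuludq,     */
+C                                                        /* psrlq $32    */
+C
+C and cbit + climb is what L(done) returns after the last limb.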
+
+L(top):
+ C eax src, incrementing
+ C ebx
+ C ecx counter, limbs
+ C edx
+ C
+ C mm0 carry bit
+ C mm1 carry limb
+ C mm6 inverse
+ C mm7 divisor
+
+ movd (%eax), %mm2
+ addl $4, %eax
+
+ psubq %mm0, %mm2 C src - cbit
+
+ psubq %mm1, %mm2 C src - cbit - climb
+ movq %mm2, %mm0
+ psrlq $63, %mm0 C new cbit
+
+ pmuludq %mm6, %mm2 C s*inverse
+
+ movq %mm7, %mm1
+ pmuludq %mm2, %mm1 C q*divisor
+ psrlq $32, %mm1 C new climb
+
+ subl $1, %ecx
+ jnz L(top)
+
+
+L(done):
+ paddq %mm1, %mm0
+ movd %mm0, %eax
+ emms
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm
new file mode 100644
index 0000000..6347b8b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_1.asm
@@ -0,0 +1,164 @@
+dnl mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) 4.17
+C P6 model 13 (Dothan) 4.17
+C P4 model 0-1 (Willamette) 4
+C P4 model 2 (Northwood) 4
+C P4 model 3-4 (Prescott) 4.55
+
+C TODO:
+C  * Tweak eax/edx offsets in the loop so as to save some lea's
+C * Perhaps software pipeline small-case code
+
+C INPUT PARAMETERS
+C rp sp + 4
+C up sp + 8
+C n sp + 12
+C v0 sp + 16
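+
+C Functionally this is the usual limb-by-limb multiply; a rough C
+C reference (a sketch, not the GMP code; <stdint.h> types assumed):
+C
+C     uint32_t mul_1_ref (uint32_t *rp, const uint32_t *up,
+C                         size_t n, uint32_t v0)
+C     {
+C       uint32_t cy = 0;                        /* mpn_mul_1c seeds this */
+C       for (size_t i = 0; i < n; i++)
+C         {
+C           uint64_t t = (uint64_t) up[i] * v0 + cy;   /* pmuludq, paddq */
+C           rp[i] = (uint32_t) t;                      /* movd           */
+C           cy = (uint32_t) (t >> 32);                 /* psrlq $32      */
+C         }
+C       return cy;                              /* carry-out limb */
+C     }
+C
+C The unrolled loop below overlaps four such steps per iteration.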
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_1)
+ pxor %mm6, %mm6
+L(ent): mov 4(%esp), %edx
+ mov 8(%esp), %eax
+ mov 12(%esp), %ecx
+ movd 16(%esp), %mm7
+ cmp $4, %ecx
+ jnc L(big)
+
+L(lp0): movd (%eax), %mm0
+ lea 4(%eax), %eax
+ lea 4(%edx), %edx
+ pmuludq %mm7, %mm0
+ paddq %mm0, %mm6
+ movd %mm6, -4(%edx)
+ psrlq $32, %mm6
+ dec %ecx
+ jnz L(lp0)
+ movd %mm6, %eax
+ emms
+ ret
+
+L(big): and $3, %ecx
+ je L(0)
+ cmp $2, %ecx
+ jc L(1)
+ je L(2)
+ jmp L(3) C FIXME: one case should fall through
+
+L(0): movd (%eax), %mm3
+ sub 12(%esp), %ecx C loop count
+ lea -16(%eax), %eax
+ lea -12(%edx), %edx
+ pmuludq %mm7, %mm3
+ movd 20(%eax), %mm0
+ pmuludq %mm7, %mm0
+ movd 24(%eax), %mm1
+ jmp L(00)
+
+L(1): movd (%eax), %mm2
+ sub 12(%esp), %ecx
+ lea -12(%eax), %eax
+ lea -8(%edx), %edx
+ pmuludq %mm7, %mm2
+ movd 16(%eax), %mm3
+ pmuludq %mm7, %mm3
+ movd 20(%eax), %mm0
+ jmp L(01)
+
+L(2): movd (%eax), %mm1
+ sub 12(%esp), %ecx
+ lea -8(%eax), %eax
+ lea -4(%edx), %edx
+ pmuludq %mm7, %mm1
+ movd 12(%eax), %mm2
+ pmuludq %mm7, %mm2
+ movd 16(%eax), %mm3
+ jmp L(10)
+
+L(3): movd (%eax), %mm0
+ sub 12(%esp), %ecx
+ lea -4(%eax), %eax
+ pmuludq %mm7, %mm0
+ movd 8(%eax), %mm1
+ pmuludq %mm7, %mm1
+ movd 12(%eax), %mm2
+
+ ALIGN(16)
+L(top): pmuludq %mm7, %mm2
+ paddq %mm0, %mm6
+ movd 16(%eax), %mm3
+ movd %mm6, 0(%edx)
+ psrlq $32, %mm6
+L(10): pmuludq %mm7, %mm3
+ paddq %mm1, %mm6
+ movd 20(%eax), %mm0
+ movd %mm6, 4(%edx)
+ psrlq $32, %mm6
+L(01): pmuludq %mm7, %mm0
+ paddq %mm2, %mm6
+ movd 24(%eax), %mm1
+ movd %mm6, 8(%edx)
+ psrlq $32, %mm6
+L(00): pmuludq %mm7, %mm1
+ paddq %mm3, %mm6
+ movd 28(%eax), %mm2
+ movd %mm6, 12(%edx)
+ psrlq $32, %mm6
+ lea 16(%eax), %eax
+ lea 16(%edx), %edx
+ add $4, %ecx
+ ja L(top)
+
+L(end): pmuludq %mm7, %mm2
+ paddq %mm0, %mm6
+ movd %mm6, 0(%edx)
+ psrlq $32, %mm6
+ paddq %mm1, %mm6
+ movd %mm6, 4(%edx)
+ psrlq $32, %mm6
+ paddq %mm2, %mm6
+ movd %mm6, 8(%edx)
+ psrlq $32, %mm6
+ movd %mm6, %eax
+ emms
+ ret
+EPILOGUE()
+PROLOGUE(mpn_mul_1c)
+ movd 20(%esp), %mm6
+ jmp L(ent)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm
new file mode 100644
index 0000000..6e3775a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/mul_basecase.asm
@@ -0,0 +1,662 @@
+dnl mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C * Improve ad-hoc outer loop code and register handling. Some feed-in
+C scheduling could improve things by several cycles per outer iteration.
+C * In code for un <= 3, try keeping accumulation operands in registers,
+C without storing intermediates to rp.
+C * We might want to keep 32 in a free mm register, since the register form is
+C 3 bytes and the immediate form is 4 bytes. About 70 bytes to save.
+C * Look into different loop alignment, we now expand the code about 50 bytes
+C with possibly needless alignment.
+C * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
+C * Use OSP, should solve feed-in latency problems.
+C * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
+C * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
+C so that they can share feed-in code, and changing the branch targets from
+C L<n> to Lm<nn>.
+
+C cycles/limb
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 5.24
+C P6 model 14 (Yonah) ?
+C P4 model 0-1 (Willamette): 5
+C P4 model 2 (Northwood): 4.60 at 32 limbs
+C P4 model 3-4 (Prescott): 4.94 at 32 limbs
+
+C INPUT PARAMETERS
+C rp sp + 4
+C up sp + 8
+C un sp + 12
+C vp sp + 16
+C vn sp + 20
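+
+C Structurally this is the schoolbook product rp[0..un+vn-1] = up x vp:
+C one mul_1 pass for vp[0], then one addmul_1-style pass per further vp
+C limb.  A rough C reference (a sketch, not the GMP code; un >= vn >= 1
+C and <stdint.h> types assumed):
+C
+C     void mul_basecase_ref (uint32_t *rp, const uint32_t *up, size_t un,
+C                            const uint32_t *vp, size_t vn)
+C     {
+C       uint32_t cy = 0;
+C       for (size_t i = 0; i < un; i++)         /* rp = up * vp[0] */
+C         {
+C           uint64_t t = (uint64_t) up[i] * vp[0] + cy;
+C           rp[i] = (uint32_t) t;
+C           cy = (uint32_t) (t >> 32);
+C         }
+C       rp[un] = cy;
+C       for (size_t j = 1; j < vn; j++)         /* rp += up * vp[j] B^j */
+C         {
+C           cy = 0;
+C           for (size_t i = 0; i < un; i++)
+C             {
+C               uint64_t t = (uint64_t) up[i] * vp[j] + rp[i + j] + cy;
+C               rp[i + j] = (uint32_t) t;
+C               cy = (uint32_t) (t >> 32);
+C             }
+C           rp[un + j] = cy;
+C         }
+C     }
+C
+C The code below specializes this per (un mod 4) and keeps the addmul
+C accumulation in mmx registers.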
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+ push %esi
+ push %ebx
+ mov 12(%esp), %edx C rp
+ mov 16(%esp), %eax C up
+ mov 20(%esp), %ecx C un
+ mov 24(%esp), %esi C vp
+ mov 28(%esp), %ebx C vn
+ movd (%esi), %mm7 C
+L(ent): cmp $3, %ecx
+ ja L(big)
+ movd (%eax), %mm6
+ pmuludq %mm7, %mm6
+ jz L(un3)
+ cmp $2, %ecx
+ jz L(un2)
+
+L(un1): movd %mm6, (%edx) C un=1
+ psrlq $32, %mm6 C un=1
+ movd %mm6, 4(%edx) C un=1
+ jmp L(rtr) C un=1
+
+L(un2): movd 4(%eax), %mm1 C un=2
+ pmuludq %mm7, %mm1 C un=2
+ movd %mm6, (%edx) C un=2
+ psrlq $32, %mm6 C un=2
+ paddq %mm1, %mm6 C un=2
+ movd %mm6, 4(%edx) C un=2
+ psrlq $32, %mm6 C un=2
+ movd %mm6, 8(%edx) C un=2
+ dec %ebx C un=2
+ jz L(rtr) C un=2
+ movd 4(%esi), %mm7 C un=2
+ movd (%eax), %mm6 C un=2
+ pmuludq %mm7, %mm6 C un=2
+ movd 4(%eax), %mm1 C un=2
+ movd 4(%edx), %mm4 C un=2
+ pmuludq %mm7, %mm1 C un=2
+ movd 8(%edx), %mm5 C un=2
+ paddq %mm4, %mm6 C un=2
+ paddq %mm1, %mm5 C un=2
+ movd %mm6, 4(%edx) C un=2
+ psrlq $32, %mm6 C un=2
+ paddq %mm5, %mm6 C un=2
+ movd %mm6, 8(%edx) C un=2
+ psrlq $32, %mm6 C un=2
+ movd %mm6, 12(%edx) C un=2
+L(rtr): emms
+ pop %ebx
+ pop %esi
+ ret
+
+L(un3): movd 4(%eax), %mm1 C un=3
+ pmuludq %mm7, %mm1 C un=3
+ movd 8(%eax), %mm2 C un=3
+ pmuludq %mm7, %mm2 C un=3
+ movd %mm6, (%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ paddq %mm1, %mm6 C un=3
+ movd %mm6, 4(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ paddq %mm2, %mm6 C un=3
+ movd %mm6, 8(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ movd %mm6, 12(%edx) C un=3
+ dec %ebx C un=3
+ jz L(rtr) C un=3
+ movd 4(%esi), %mm7 C un=3
+ movd (%eax), %mm6 C un=3
+ pmuludq %mm7, %mm6 C un=3
+ movd 4(%eax), %mm1 C un=3
+ movd 4(%edx), %mm4 C un=3
+ pmuludq %mm7, %mm1 C un=3
+ movd 8(%eax), %mm2 C un=3
+ movd 8(%edx), %mm5 C un=3
+ pmuludq %mm7, %mm2 C un=3
+ paddq %mm4, %mm6 C un=3
+ paddq %mm1, %mm5 C un=3
+ movd 12(%edx), %mm4 C un=3
+ movd %mm6, 4(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ paddq %mm5, %mm6 C un=3
+ paddq %mm2, %mm4 C un=3
+ movd %mm6, 8(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ paddq %mm4, %mm6 C un=3
+ movd %mm6, 12(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ movd %mm6, 16(%edx) C un=3
+ dec %ebx C un=3
+ jz L(rtr) C un=3
+ movd 8(%esi), %mm7 C un=3
+ movd (%eax), %mm6 C un=3
+ pmuludq %mm7, %mm6 C un=3
+ movd 4(%eax), %mm1 C un=3
+ movd 8(%edx), %mm4 C un=3
+ pmuludq %mm7, %mm1 C un=3
+ movd 8(%eax), %mm2 C un=3
+ movd 12(%edx), %mm5 C un=3
+ pmuludq %mm7, %mm2 C un=3
+ paddq %mm4, %mm6 C un=3
+ paddq %mm1, %mm5 C un=3
+ movd 16(%edx), %mm4 C un=3
+ movd %mm6, 8(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ paddq %mm5, %mm6 C un=3
+ paddq %mm2, %mm4 C un=3
+ movd %mm6, 12(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ paddq %mm4, %mm6 C un=3
+ movd %mm6, 16(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ movd %mm6, 20(%edx) C un=3
+ jmp L(rtr)
+
+
+L(big): push %edi
+ pxor %mm6, %mm6
+ lea 4(%esi), %esi
+ and $3, %ecx
+ jz L(0)
+ cmp $2, %ecx
+ jc L(1)
+ jz L(2)
+ jmp L(3) C FIXME: one case should fall through
+
+
+L(0): movd (%eax), %mm3 C m 0
+ sub 24(%esp), %ecx C inner loop count m 0
+ mov %ecx, 24(%esp) C update loop count for later m 0
+ pmuludq %mm7, %mm3 C m 0
+ movd 4(%eax), %mm0 C m 0
+ pmuludq %mm7, %mm0 C m 0
+ movd 8(%eax), %mm1 C m 0
+ jmp L(m00) C m 0
+ ALIGN(16) C m 0
+L(lpm0):
+ pmuludq %mm7, %mm4 C m 0
+ paddq %mm0, %mm6 C m 0
+ movd (%eax), %mm3 C m 0
+ movd %mm6, -12(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ pmuludq %mm7, %mm3 C m 0
+ paddq %mm1, %mm6 C m 0
+ movd 4(%eax), %mm0 C m 0
+ movd %mm6, -8(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ pmuludq %mm7, %mm0 C m 0
+ paddq %mm4, %mm6 C m 0
+ movd 8(%eax), %mm1 C m 0
+ movd %mm6, -4(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+L(m00): pmuludq %mm7, %mm1 C m 0
+ paddq %mm3, %mm6 C m 0
+ movd 12(%eax), %mm4 C m 0
+ movd %mm6, (%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ lea 16(%eax), %eax C m 0
+ lea 16(%edx), %edx C m 0
+ add $4, %ecx C m 0
+ ja L(lpm0) C m 0
+ pmuludq %mm7, %mm4 C m 0
+ paddq %mm0, %mm6 C m 0
+ movd %mm6, -12(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ paddq %mm1, %mm6 C m 0
+ mov 16(%esp), %edi C rp 0
+ jmp L(x0)
+
+L(olp0):
+ lea 4(%edi), %edi C am 0
+ movd (%esi), %mm7 C am 0
+ lea 4(%esi), %esi C am 0
+ mov %edi, %edx C rp am 0
+ mov 20(%esp), %eax C up am 0
+ movd (%eax), %mm3 C am 0
+ mov 24(%esp), %ecx C inner loop count am 0
+ pxor %mm6, %mm6 C am 0
+ pmuludq %mm7, %mm3 C am 0
+ movd 4(%eax), %mm0 C am 0
+ movd (%edx), %mm5 C am 0
+ pmuludq %mm7, %mm0 C am 0
+ movd 8(%eax), %mm1 C am 0
+ paddq %mm3, %mm5 C am 0
+ movd 4(%edx), %mm4 C am 0
+ jmp L(am00) C am 0
+ ALIGN(16) C mm 0
+L(lam0):
+ pmuludq %mm7, %mm2 C am 0
+ paddq %mm4, %mm6 C am 0
+ movd (%eax), %mm3 C am 0
+ paddq %mm1, %mm5 C am 0
+ movd -4(%edx), %mm4 C am 0
+ movd %mm6, -12(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ pmuludq %mm7, %mm3 C am 0
+ paddq %mm5, %mm6 C am 0
+ movd 4(%eax), %mm0 C am 0
+ paddq %mm2, %mm4 C am 0
+ movd (%edx), %mm5 C am 0
+ movd %mm6, -8(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ pmuludq %mm7, %mm0 C am 0
+ paddq %mm4, %mm6 C am 0
+ movd 8(%eax), %mm1 C am 0
+ paddq %mm3, %mm5 C am 0
+ movd 4(%edx), %mm4 C am 0
+ movd %mm6, -4(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+L(am00):
+ pmuludq %mm7, %mm1 C am 0
+ paddq %mm5, %mm6 C am 0
+ movd 12(%eax), %mm2 C am 0
+ paddq %mm0, %mm4 C am 0
+ movd 8(%edx), %mm5 C am 0
+ movd %mm6, (%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ lea 16(%eax), %eax C am 0
+ lea 16(%edx), %edx C am 0
+ add $4, %ecx C am 0
+ jnz L(lam0) C am 0
+ pmuludq %mm7, %mm2 C am 0
+ paddq %mm4, %mm6 C am 0
+ paddq %mm1, %mm5 C am 0
+ movd -4(%edx), %mm4 C am 0
+ movd %mm6, -12(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ paddq %mm5, %mm6 C am 0
+ paddq %mm2, %mm4 C am 0
+L(x0): movd %mm6, -8(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ paddq %mm4, %mm6 C am 0
+ movd %mm6, -4(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ movd %mm6, (%edx) C am 0
+ dec %ebx C am 0
+ jnz L(olp0) C am 0
+L(oel0):
+ emms C 0
+ pop %edi C 0
+ pop %ebx C 0
+ pop %esi C 0
+ ret C 0
+
+
+L(1): movd (%eax), %mm4 C m 1
+ sub 24(%esp), %ecx C m 1
+ mov %ecx, 24(%esp) C update loop count for later m 1
+ pmuludq %mm7, %mm4 C m 1
+ movd 4(%eax), %mm3 C m 1
+ pmuludq %mm7, %mm3 C m 1
+ movd 8(%eax), %mm0 C m 1
+ jmp L(m01) C m 1
+ ALIGN(16) C m 1
+L(lpm1):
+ pmuludq %mm7, %mm4 C m 1
+ paddq %mm0, %mm6 C m 1
+ movd 4(%eax), %mm3 C m 1
+ movd %mm6, -8(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ pmuludq %mm7, %mm3 C m 1
+ paddq %mm1, %mm6 C m 1
+ movd 8(%eax), %mm0 C m 1
+ movd %mm6, -4(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+L(m01): pmuludq %mm7, %mm0 C m 1
+ paddq %mm4, %mm6 C m 1
+ movd 12(%eax), %mm1 C m 1
+ movd %mm6, (%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ pmuludq %mm7, %mm1 C m 1
+ paddq %mm3, %mm6 C m 1
+ movd 16(%eax), %mm4 C m 1
+ movd %mm6, 4(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ lea 16(%eax), %eax C m 1
+ lea 16(%edx), %edx C m 1
+ add $4, %ecx C m 1
+ ja L(lpm1) C m 1
+ pmuludq %mm7, %mm4 C m 1
+ paddq %mm0, %mm6 C m 1
+ movd %mm6, -8(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ paddq %mm1, %mm6 C m 1
+ mov 16(%esp), %edi C rp 1
+ jmp L(x1)
+
+L(olp1):
+ lea 4(%edi), %edi C am 1
+ movd (%esi), %mm7 C am 1
+ lea 4(%esi), %esi C am 1
+ mov %edi, %edx C rp am 1
+ mov 20(%esp), %eax C up am 1
+ movd (%eax), %mm2 C am 1
+ mov 24(%esp), %ecx C inner loop count am 1
+ pxor %mm6, %mm6 C am 1
+ pmuludq %mm7, %mm2 C am 1
+ movd 4(%eax), %mm3 C am 1
+ movd (%edx), %mm4 C am 1
+ pmuludq %mm7, %mm3 C am 1
+ movd 8(%eax), %mm0 C am 1
+ paddq %mm2, %mm4 C am 1
+ movd 4(%edx), %mm5 C am 1
+ jmp L(am01) C am 1
+ ALIGN(16) C am 1
+L(lam1):
+ pmuludq %mm7, %mm2 C am 1
+ paddq %mm4, %mm6 C am 1
+ movd 4(%eax), %mm3 C am 1
+ paddq %mm1, %mm5 C am 1
+ movd (%edx), %mm4 C am 1
+ movd %mm6, -8(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ pmuludq %mm7, %mm3 C am 1
+ paddq %mm5, %mm6 C am 1
+ movd 8(%eax), %mm0 C am 1
+ paddq %mm2, %mm4 C am 1
+ movd 4(%edx), %mm5 C am 1
+ movd %mm6, -4(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+L(am01):
+ pmuludq %mm7, %mm0 C am 1
+ paddq %mm4, %mm6 C am 1
+ movd 12(%eax), %mm1 C am 1
+ paddq %mm3, %mm5 C am 1
+ movd 8(%edx), %mm4 C am 1
+ movd %mm6, (%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ pmuludq %mm7, %mm1 C am 1
+ paddq %mm5, %mm6 C am 1
+ movd 16(%eax), %mm2 C am 1
+ paddq %mm0, %mm4 C am 1
+ movd 12(%edx), %mm5 C am 1
+ movd %mm6, 4(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ lea 16(%eax), %eax C am 1
+ lea 16(%edx), %edx C am 1
+ add $4, %ecx C am 1
+ jnz L(lam1) C am 1
+ pmuludq %mm7, %mm2 C am 1
+ paddq %mm4, %mm6 C am 1
+ paddq %mm1, %mm5 C am 1
+ movd (%edx), %mm4 C am 1
+ movd %mm6, -8(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ paddq %mm5, %mm6 C am 1
+ paddq %mm2, %mm4 C am 1
+L(x1): movd %mm6, -4(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ paddq %mm4, %mm6 C am 1
+ movd %mm6, (%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ movd %mm6, 4(%edx) C am 1
+ dec %ebx C am 1
+ jnz L(olp1) C am 1
+L(oel1):
+ emms C 1
+ pop %edi C 1
+ pop %ebx C 1
+ pop %esi C 1
+ ret C 1
+
+
+L(2): movd (%eax), %mm1 C m 2
+ sub 24(%esp), %ecx C m 2
+ mov %ecx, 24(%esp) C update loop count for later m 2
+ pmuludq %mm7, %mm1 C m 2
+ movd 4(%eax), %mm4 C m 2
+ pmuludq %mm7, %mm4 C m 2
+ movd 8(%eax), %mm3 C m 2
+ jmp L(m10) C m 2
+ ALIGN(16) C m 2
+L(lpm2):
+ pmuludq %mm7, %mm4 C m 2
+ paddq %mm0, %mm6 C m 2
+ movd 8(%eax), %mm3 C m 2
+ movd %mm6, -4(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+L(m10): pmuludq %mm7, %mm3 C m 2
+ paddq %mm1, %mm6 C m 2
+ movd 12(%eax), %mm0 C m 2
+ movd %mm6, (%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ pmuludq %mm7, %mm0 C m 2
+ paddq %mm4, %mm6 C m 2
+ movd 16(%eax), %mm1 C m 2
+ movd %mm6, 4(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ pmuludq %mm7, %mm1 C m 2
+ paddq %mm3, %mm6 C m 2
+ movd 20(%eax), %mm4 C m 2
+ movd %mm6, 8(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ lea 16(%eax), %eax C m 2
+ lea 16(%edx), %edx C m 2
+ add $4, %ecx C m 2
+ ja L(lpm2) C m 2
+ pmuludq %mm7, %mm4 C m 2
+ paddq %mm0, %mm6 C m 2
+ movd %mm6, -4(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ paddq %mm1, %mm6 C m 2
+ mov 16(%esp), %edi C rp 2
+ jmp L(x2)
+
+L(olp2):
+ lea 4(%edi), %edi C am 2
+ movd (%esi), %mm7 C am 2
+ lea 4(%esi), %esi C am 2
+ mov %edi, %edx C rp am 2
+ mov 20(%esp), %eax C up am 2
+ movd (%eax), %mm1 C am 2
+ mov 24(%esp), %ecx C inner loop count am 2
+ pxor %mm6, %mm6 C am 2
+ pmuludq %mm7, %mm1 C am 2
+ movd 4(%eax), %mm2 C am 2
+ movd (%edx), %mm5 C am 2
+ pmuludq %mm7, %mm2 C am 2
+ movd 8(%eax), %mm3 C am 2
+ paddq %mm1, %mm5 C am 2
+ movd 4(%edx), %mm4 C am 2
+ jmp L(am10) C am 2
+ ALIGN(16) C am 2
+L(lam2):
+ pmuludq %mm7, %mm2 C am 2
+ paddq %mm4, %mm6 C am 2
+ movd 8(%eax), %mm3 C am 2
+ paddq %mm1, %mm5 C am 2
+ movd 4(%edx), %mm4 C am 2
+ movd %mm6, -4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+L(am10):
+ pmuludq %mm7, %mm3 C am 2
+ paddq %mm5, %mm6 C am 2
+ movd 12(%eax), %mm0 C am 2
+ paddq %mm2, %mm4 C am 2
+ movd 8(%edx), %mm5 C am 2
+ movd %mm6, (%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ pmuludq %mm7, %mm0 C am 2
+ paddq %mm4, %mm6 C am 2
+ movd 16(%eax), %mm1 C am 2
+ paddq %mm3, %mm5 C am 2
+ movd 12(%edx), %mm4 C am 2
+ movd %mm6, 4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ pmuludq %mm7, %mm1 C am 2
+ paddq %mm5, %mm6 C am 2
+ movd 20(%eax), %mm2 C am 2
+ paddq %mm0, %mm4 C am 2
+ movd 16(%edx), %mm5 C am 2
+ movd %mm6, 8(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ lea 16(%eax), %eax C am 2
+ lea 16(%edx), %edx C am 2
+ add $4, %ecx C am 2
+ jnz L(lam2) C am 2
+ pmuludq %mm7, %mm2 C am 2
+ paddq %mm4, %mm6 C am 2
+ paddq %mm1, %mm5 C am 2
+ movd 4(%edx), %mm4 C am 2
+ movd %mm6, -4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ paddq %mm5, %mm6 C am 2
+ paddq %mm2, %mm4 C am 2
+L(x2): movd %mm6, (%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ paddq %mm4, %mm6 C am 2
+ movd %mm6, 4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ movd %mm6, 8(%edx) C am 2
+ dec %ebx C am 2
+ jnz L(olp2) C am 2
+L(oel2):
+ emms C 2
+ pop %edi C 2
+ pop %ebx C 2
+ pop %esi C 2
+ ret C 2
+
+
+L(3): movd (%eax), %mm0 C m 3
+ sub 24(%esp), %ecx C m 3
+ mov %ecx, 24(%esp) C update loop count for later m 3
+ pmuludq %mm7, %mm0 C m 3
+ movd 4(%eax), %mm1 C m 3
+ pmuludq %mm7, %mm1 C m 3
+ movd 8(%eax), %mm4 C m 3
+ jmp L(lpm3) C m 3
+ ALIGN(16) C m 3
+L(lpm3):
+ pmuludq %mm7, %mm4 C m 3
+ paddq %mm0, %mm6 C m 3
+ movd 12(%eax), %mm3 C m 3
+ movd %mm6, (%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ pmuludq %mm7, %mm3 C m 3
+ paddq %mm1, %mm6 C m 3
+ movd 16(%eax), %mm0 C m 3
+ movd %mm6, 4(%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ pmuludq %mm7, %mm0 C m 3
+ paddq %mm4, %mm6 C m 3
+ movd 20(%eax), %mm1 C m 3
+ movd %mm6, 8(%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ pmuludq %mm7, %mm1 C m 3
+ paddq %mm3, %mm6 C m 3
+ movd 24(%eax), %mm4 C m 3
+ movd %mm6, 12(%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ lea 16(%eax), %eax C m 3
+ lea 16(%edx), %edx C m 3
+ add $4, %ecx C m 3
+ ja L(lpm3) C m 3
+ pmuludq %mm7, %mm4 C m 3
+ paddq %mm0, %mm6 C m 3
+ movd %mm6, (%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ paddq %mm1, %mm6 C m 3
+ mov 16(%esp), %edi C rp 3
+ jmp L(x3)
+
+L(olp3):
+ lea 4(%edi), %edi C am 3
+ movd (%esi), %mm7 C am 3
+ lea 4(%esi), %esi C am 3
+ mov %edi, %edx C rp am 3
+ mov 20(%esp), %eax C up am 3
+ movd (%eax), %mm0 C am 3
+ mov 24(%esp), %ecx C inner loop count am 3
+ pxor %mm6, %mm6 C am 3
+ pmuludq %mm7, %mm0 C am 3
+ movd 4(%eax), %mm1 C am 3
+ movd (%edx), %mm4 C am 3
+ pmuludq %mm7, %mm1 C am 3
+ movd 8(%eax), %mm2 C am 3
+ paddq %mm0, %mm4 C am 3
+ movd 4(%edx), %mm5 C am 3
+ jmp L(lam3) C am 3
+ ALIGN(16) C am 3
+L(lam3):
+ pmuludq %mm7, %mm2 C am 3
+ paddq %mm4, %mm6 C am 3
+ movd 12(%eax), %mm3 C am 3
+ paddq %mm1, %mm5 C am 3
+ movd 8(%edx), %mm4 C am 3
+ movd %mm6, (%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ pmuludq %mm7, %mm3 C am 3
+ paddq %mm5, %mm6 C am 3
+ movd 16(%eax), %mm0 C am 3
+ paddq %mm2, %mm4 C am 3
+ movd 12(%edx), %mm5 C am 3
+ movd %mm6, 4(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ pmuludq %mm7, %mm0 C am 3
+ paddq %mm4, %mm6 C am 3
+ movd 20(%eax), %mm1 C am 3
+ paddq %mm3, %mm5 C am 3
+ movd 16(%edx), %mm4 C am 3
+ movd %mm6, 8(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ pmuludq %mm7, %mm1 C am 3
+ paddq %mm5, %mm6 C am 3
+ movd 24(%eax), %mm2 C am 3
+ paddq %mm0, %mm4 C am 3
+ movd 20(%edx), %mm5 C am 3
+ movd %mm6, 12(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ lea 16(%eax), %eax C am 3
+ lea 16(%edx), %edx C am 3
+ add $4, %ecx C am 3
+ jnz L(lam3) C am 3
+ pmuludq %mm7, %mm2 C am 3
+ paddq %mm4, %mm6 C am 3
+ paddq %mm1, %mm5 C am 3
+ movd 8(%edx), %mm4 C am 3
+ movd %mm6, (%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ paddq %mm5, %mm6 C am 3
+ paddq %mm2, %mm4 C am 3
+L(x3): movd %mm6, 4(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ paddq %mm4, %mm6 C am 3
+ movd %mm6, 8(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ movd %mm6, 12(%edx) C am 3
+ dec %ebx C am 3
+ jnz L(olp3) C am 3
+L(oel3):
+ emms C 3
+ pop %edi C 3
+ pop %ebx C 3
+ pop %esi C 3
+ ret C 3
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm
new file mode 100644
index 0000000..c7f4426
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm
@@ -0,0 +1,281 @@
+dnl X86-32 and X86-64 mpn_popcount using SSE2.
+
+dnl Copyright 2006, 2007, 2011, 2015, 2020 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C 32-bit popcount hamdist
+C cycles/limb cycles/limb
+C P5 -
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) 4
+C P4 model 0 (Willamette) ?
+C P4 model 1 (?) ?
+C P4 model 2 (Northwood) 3.9
+C P4 model 3 (Prescott) ?
+C P4 model 4 (Nocona) ?
+C AMD K6 -
+C AMD K7 -
+C AMD K8 ?
+
+C 64-bit popcount hamdist
+C cycles/limb cycles/limb
+C P4 model 4 (Nocona): 8
+C AMD K8,K9 7.5
+C AMD K10 3.5
+C Intel core2 3.68
+C Intel corei 3.15
+C Intel atom 10.8
+C VIA nano 6.5
+
+C TODO
+C * Make an mpn_hamdist based on this. Alignment could either be handled by
+C using movdqu for one operand and movdqa for the other, or by painfully
+C shifting as we go. Unfortunately, there seems to be no usable shift
+C instruction, except for one that takes an immediate count.
+C * It would probably be possible to cut a few cycles/limb using software
+C pipelining.
+C * There are 35 decode slots unused by the SSE2 instructions. Loop control
+C needs just 2 or 3 slots, leaving around 32 slots. This allows a parallel
+C integer based popcount. Such a combined loop would handle 6 limbs in
+C about 30 cycles on K8.
+C * We could save a byte or two by using 32-bit operations on areg.
+C * Check if using movdqa to a temp register and then a register-based pand
+C   is faster.
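+C
+C The masks declared at the end of this file drive the classic bit-slice
+C reduction; per 32-bit word it is (a rough sketch, not the GMP code):
+C
+C     uint32_t popcount32_ref (uint32_t x)
+C     {
+C       x -= (x >> 1) & 0x55555555;                       /* 2-bit sums */
+C       x  = (x & 0x33333333) + ((x >> 2) & 0x33333333);  /* 4-bit sums */
+C       x  = (x + (x >> 4)) & 0x0f0f0f0f;                 /* 8-bit sums */
+C       return (x * 0x01010101) >> 24;                    /* add bytes  */
+C     }
+C
+C The xmm loop below performs the same mask/add steps on 128 bits at a
+C time and lets psadbw do the final byte summation.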
+
+ifelse(GMP_LIMB_BITS,`32',
+` define(`up', `%edx')
+ define(`n', `%ecx')
+ define(`areg',`%eax')
+ define(`breg',`%ebx')
+ define(`zero',`%xmm4')
+ define(`LIMB32',` $1')
+ define(`LIMB64',`dnl')
+',`
+ define(`up', `%rdi')
+ define(`n', `%rsi')
+ define(`areg',`%rax')
+ define(`breg',`%rdx')
+ define(`zero',`%xmm8')
+ define(`LIMB32',`dnl')
+ define(`LIMB64',` $1')
+')
+
+define(`mm01010101',`%xmm6')
+define(`mm00110011',`%xmm7')
+define(`mm00001111',`%xmm2')
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_XMM', eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
+
+undefine(`psadbw') C override inherited m4 version
+
+C This file is shared between 32-bit and 64-bit builds. Only the former has
+C LEAL. Default LEAL as an alias of LEA.
+ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')
+
+ASM_START()
+
+C Make cnsts global to work around Apple relocation bug.
+ifdef(`DARWIN',`
+ define(`cnsts', MPN(popccnsts))
+ GLOBL cnsts')
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_popcount)
+
+LIMB32(`mov 4(%esp), up ')
+LIMB32(`mov 8(%esp), n ')
+LIMB32(`push %ebx ')
+
+ pxor %xmm3, %xmm3 C zero grand total count
+LIMB64(`pxor zero, zero ')
+
+ LEAL( cnsts, breg)
+
+ movdqa -48(breg), mm01010101
+ movdqa -32(breg), mm00110011
+ movdqa -16(breg), mm00001111
+
+ mov up, areg
+ and $-16, up C round `up' down to 128-bit boundary
+ and $12, areg C 32:areg = 0, 4, 8, 12
+ C 64:areg = 0, 8
+ movdqa (up), %xmm0
+ pand 64(breg,areg,4), %xmm0
+ shr $m4_log2(GMP_LIMB_BYTES), %eax
+ add areg, n C compensate n for rounded down `up'
+
+ pxor %xmm4, %xmm4
+ sub $LIMBS_PER_XMM, n
+ jbe L(sum)
+
+ sub $LIMBS_PER_XMM, n
+ ja L(ent)
+ jmp L(lsum)
+
+ ALIGN(16)
+L(top): movdqa (up), %xmm0
+L(ent): movdqa 16(up), %xmm4
+
+ movdqa %xmm0, %xmm1
+ movdqa %xmm4, %xmm5
+ psrld $1, %xmm0
+ psrld $1, %xmm4
+ pand mm01010101, %xmm0
+ pand mm01010101, %xmm4
+ psubd %xmm0, %xmm1
+ psubd %xmm4, %xmm5
+
+ movdqa %xmm1, %xmm0
+ movdqa %xmm5, %xmm4
+ psrlq $2, %xmm1
+ psrlq $2, %xmm5
+ pand mm00110011, %xmm0
+ pand mm00110011, %xmm4
+ pand mm00110011, %xmm1
+ pand mm00110011, %xmm5
+ paddq %xmm0, %xmm1
+ paddq %xmm4, %xmm5
+
+LIMB32(`pxor zero, zero ')
+
+ add $32, up
+ sub $LIMBS_PER_2XMM, n
+
+ paddq %xmm5, %xmm1
+ movdqa %xmm1, %xmm0
+ psrlq $4, %xmm1
+ pand mm00001111, %xmm0
+ pand mm00001111, %xmm1
+ paddq %xmm0, %xmm1
+
+ psadbw zero, %xmm1
+ paddq %xmm1, %xmm3 C add to grand total
+
+ jnc L(top)
+L(end):
+ add $LIMBS_PER_2XMM, n
+ jz L(rt)
+ movdqa (up), %xmm0
+ pxor %xmm4, %xmm4
+ sub $LIMBS_PER_XMM, n
+ jbe L(sum)
+L(lsum):
+ movdqa %xmm0, %xmm4
+ movdqa 16(up), %xmm0
+L(sum):
+ shl $m4_log2(GMP_LIMB_BYTES), n
+ and $12, n
+ pand (breg,n,4), %xmm0
+
+ movdqa %xmm0, %xmm1
+ movdqa %xmm4, %xmm5
+ psrld $1, %xmm0
+ psrld $1, %xmm4
+ pand mm01010101, %xmm0
+ pand mm01010101, %xmm4
+ psubd %xmm0, %xmm1
+ psubd %xmm4, %xmm5
+
+ movdqa %xmm1, %xmm0
+ movdqa %xmm5, %xmm4
+ psrlq $2, %xmm1
+ psrlq $2, %xmm5
+ pand mm00110011, %xmm0
+ pand mm00110011, %xmm4
+ pand mm00110011, %xmm1
+ pand mm00110011, %xmm5
+ paddq %xmm0, %xmm1
+ paddq %xmm4, %xmm5
+
+LIMB32(`pxor zero, zero ')
+
+ paddq %xmm5, %xmm1
+ movdqa %xmm1, %xmm0
+ psrlq $4, %xmm1
+ pand mm00001111, %xmm0
+ pand mm00001111, %xmm1
+ paddq %xmm0, %xmm1
+
+ psadbw zero, %xmm1
+ paddq %xmm1, %xmm3 C add to grand total
+
+
+C Add the two 64-bit halves of the grand total counter
+L(rt): movdqa %xmm3, %xmm0
+ psrldq $8, %xmm3
+ paddq %xmm3, %xmm0
+ movd %xmm0, areg C movq avoided due to gas bug
+
+LIMB32(`pop %ebx ')
+ ret
+
+EPILOGUE()
+DEF_OBJECT(dummy,16)
+C Three magic constants used for masking out bits
+ .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+ .byte 0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+
+ .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+ .byte 0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+cnsts:
+C Masks for high end of number
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+ .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+ .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+C Masks for low end of number
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+ .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+ .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+ .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+END_OBJECT(dummy)
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm
new file mode 100644
index 0000000..f421d13
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/rsh1add_n.asm
@@ -0,0 +1,126 @@
+dnl Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2
+
+dnl Copyright 2001-2004 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb (approx)
+C dst!=src1,2 dst==src1 dst==src2
+C P4: 4.5 6.5 6.5
+
+
+C mp_limb_t mpn_rsh1add_n (mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
+C mp_size_t size);
+C
+C The slightly strange combination of indexing and pointer incrementing
+C that's used seems to work best. Not sure why, but for instance leal
+C incrementing on %esi is a 1 or 2 cycle slowdown.
+C
+C The dependent chain is paddq combining the carry and next (shifted) part,
+C plus psrlq to move the new carry down. That, and just 4 mmx instructions
+C in total, makes 4 c/l the target speed, which is almost achieved for
+C separate src/dst but when src==dst the write combining anomalies slow it
+C down.
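+C
+C In plain C the computation is (a rough sketch of the same recurrence,
+C not the GMP reference code; <stdint.h> types assumed):
+C
+C     uint32_t rsh1add_n_ref (uint32_t *wp, const uint32_t *xp,
+C                             const uint32_t *yp, size_t n)
+C     {
+C       uint64_t s = (uint64_t) xp[0] + yp[0];  /* 33-bit limb-0 sum   */
+C       uint32_t lowbit = (uint32_t) s & 1;     /* the bit shifted out */
+C       uint64_t c = s >> 1;
+C       for (size_t i = 1; i < n; i++)
+C         {
+C           c += ((uint64_t) xp[i] + yp[i]) << 31;   /* paddq, psllq $31 */
+C           wp[i - 1] = (uint32_t) c;                /* movd             */
+C           c >>= 32;                                /* psrlq $32        */
+C         }
+C       wp[n - 1] = (uint32_t) c;
+C       return lowbit;
+C     }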
+
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_YP, 12)
+defframe(PARAM_XP, 8)
+defframe(PARAM_WP, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX,`PARAM_XP')
+define(SAVE_ESI,`PARAM_YP')
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_rsh1add_n)
+deflit(`FRAME',0)
+
+ movl PARAM_XP, %edx
+ movl %ebx, SAVE_EBX
+
+ movl PARAM_YP, %ebx
+ movl %esi, SAVE_ESI
+
+ movl PARAM_WP, %esi
+
+ movd (%edx), %mm0 C xp[0]
+
+ movd (%ebx), %mm1 C yp[0]
+ movl PARAM_SIZE, %ecx
+
+ movl (%edx), %eax C xp[0]
+
+ addl (%ebx), %eax C xp[0]+yp[0]
+
+ paddq %mm1, %mm0 C xp[0]+yp[0]
+ leal (%esi,%ecx,4), %esi C wp end
+ negl %ecx C -size
+
+ psrlq $1, %mm0 C (xp[0]+yp[0])/2
+ and $1, %eax C return value, rsh1 bit of xp[0]+yp[0]
+ addl $1, %ecx C -(size-1)
+ jz L(done)
+
+
+L(top):
+ C eax return value
+ C ebx yp end
+ C ecx counter, limbs, -(size-1) to -1 inclusive
+ C edx xp end
+ C esi wp end
+ C mm0 carry (32 bits)
+
+ movd 4(%edx), %mm1 C xp[i+1]
+ movd 4(%ebx), %mm2 C yp[i+1]
+ leal 4(%edx), %edx
+ leal 4(%ebx), %ebx
+ paddq %mm2, %mm1 C xp[i+1]+yp[i+1]
+ psllq $31, %mm1 C low bit at 31, further 32 above
+
+ paddq %mm1, %mm0 C 31 and carry from prev add
+ movd %mm0, -4(%esi,%ecx,4) C low ready to store dst[i]
+
+ psrlq $32, %mm0 C high becomes new carry
+
+ addl $1, %ecx
+ jnz L(top)
+
+
+L(done):
+ movd %mm0, -4(%esi) C dst[size-1]
+ movl SAVE_EBX, %ebx
+
+ movl SAVE_ESI, %esi
+ emms
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm
new file mode 100644
index 0000000..0d548e0
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/sqr_basecase.asm
@@ -0,0 +1,705 @@
+dnl mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C * Improve ad-hoc outer loop code and register handling. Some feed-in
+C scheduling could improve things by several cycles per outer iteration.
+C * In the Lam3...Lam1 code, keep accumulation operands in registers, without
+C storing intermediates to rp.
+C * We might want to keep 32 in a free mm register, since the register form is
+C 3 bytes and the immediate form is 4 bytes. About 80 bytes to save.
+C * Look into different loop alignment, we now expand the code about 50 bytes
+C with possibly needless alignment.
+C * Use OSP, should solve feed-in latency problems.
+C * Address the relative slowness for un<=3 on Pentium M. The old code is
+C   considerably faster there. (1:20/14, 2:34/32, 3:66/57)
+
+C INPUT PARAMETERS
+C rp sp + 4
+C up sp + 8
+C un sp + 12
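+
+C Functionally, rp[0..2*un-1] gets up[0..un-1] squared.  A rough C
+C reference (a sketch, not how this file organizes the work; <stdint.h>
+C and <string.h> assumed) exploiting the symmetry the same way; it
+C computes the off-diagonal products once, doubles them, then adds the
+C diagonal squares:
+C
+C     void sqr_basecase_ref (uint32_t *rp, const uint32_t *up, size_t n)
+C     {
+C       size_t i, j;
+C       uint32_t cy;
+C       memset (rp, 0, 2 * n * sizeof *rp);
+C       for (i = 0; i < n; i++)                 /* off-diagonal triangle */
+C         {
+C           cy = 0;
+C           for (j = i + 1; j < n; j++)
+C             {
+C               uint64_t t = (uint64_t) up[i] * up[j] + rp[i + j] + cy;
+C               rp[i + j] = (uint32_t) t;
+C               cy = (uint32_t) (t >> 32);
+C             }
+C           rp[i + n] = cy;
+C         }
+C       cy = 0;
+C       for (i = 0; i < 2 * n; i++)             /* double the triangle */
+C         {
+C           uint32_t hi = rp[i] >> 31;
+C           rp[i] = (rp[i] << 1) | cy;
+C           cy = hi;
+C         }
+C       cy = 0;
+C       for (i = 0; i < n; i++)                 /* add the squares */
+C         {
+C           uint64_t t = (uint64_t) up[i] * up[i] + rp[2 * i] + cy;
+C           rp[2 * i] = (uint32_t) t;
+C           t = (t >> 32) + rp[2 * i + 1];
+C           rp[2 * i + 1] = (uint32_t) t;
+C           cy = (uint32_t) (t >> 32);
+C         }
+C     }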
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_sqr_basecase)
+ mov 4(%esp), %edx C rp
+ mov 8(%esp), %eax C up
+ mov 12(%esp), %ecx C un
+
+ cmp $2, %ecx
+ jc L(un1)
+ jz L(un2)
+ cmp $4, %ecx
+ jc L(un3)
+ jz L(un4)
+ jmp L(big)
+
+L(un1): mov (%eax), %eax
+ mov %edx, %ecx
+ mul %eax
+ mov %eax, (%ecx)
+ mov %edx, 4(%ecx)
+ ret
+L(un2): movd (%eax), %mm0 C un=2
+ movd (%eax), %mm2 C un=2
+ movd 4(%eax), %mm1 C un=2
+ pmuludq %mm0, %mm0 C 64b weight 0 un=2
+ pmuludq %mm1, %mm2 C 64b weight 32 un=2
+ pmuludq %mm1, %mm1 C 64b weight 64 un=2
+ movd %mm0, (%edx) C un=2
+ psrlq $32, %mm0 C 32b weight 32 un=2
+ pcmpeqd %mm7, %mm7 C un=2
+ psrlq $33, %mm7 C 0x000000007FFFFFFF un=2
+ pand %mm2, %mm7 C 31b weight 32 un=2
+ psrlq $31, %mm2 C 33b weight 65 un=2
+ psllq $1, %mm7 C 31b weight 33 un=2
+ paddq %mm7, %mm0 C un=2
+ movd %mm0, 4(%edx) C un=2
+ psrlq $32, %mm0 C un=2
+ paddq %mm2, %mm1 C un=2
+ paddq %mm0, %mm1 C un=2
+ movd %mm1, 8(%edx) C un=2
+ psrlq $32, %mm1 C un=2
+ movd %mm1, 12(%edx) C un=2
+ emms
+ ret
+L(un3): movd (%eax), %mm7 C un=3
+ movd 4(%eax), %mm6 C un=3
+ pmuludq %mm7, %mm6 C un=3
+ movd 8(%eax), %mm2 C un=3
+ pmuludq %mm7, %mm2 C un=3
+ movd %mm6, 4(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ paddq %mm2, %mm6 C un=3
+ movd %mm6, 8(%edx) C un=3
+ psrlq $32, %mm6 C un=3
+ movd %mm6, 12(%edx) C un=3
+ lea 4(%edx), %edx C un=3
+ lea 4(%eax), %eax C un=3
+ jmp L(am1)
+L(un4): movd (%eax), %mm7 C un=4
+ movd 4(%eax), %mm6 C un=4
+ pmuludq %mm7, %mm6 C un=4
+ movd 8(%eax), %mm0 C un=4
+ pmuludq %mm7, %mm0 C un=4
+ movd 12(%eax), %mm1 C un=4
+ pmuludq %mm7, %mm1 C un=4
+ movd %mm6, 4(%edx) C un=4
+ psrlq $32, %mm6 C un=4
+ paddq %mm0, %mm6 C un=4
+ movd %mm6, 8(%edx) C un=4
+ psrlq $32, %mm6 C un=4
+ paddq %mm1, %mm6 C un=4
+ movd %mm6, 12(%edx) C un=4
+ psrlq $32, %mm6 C un=4
+ movd %mm6, 16(%edx) C un=4
+ lea 4(%edx), %edx C un=4
+ lea 4(%eax), %eax C un=4
+ jmp L(am2)
+
+L(big): push %esi
+ push %ebx
+ push %edi
+ pxor %mm6, %mm6
+ movd (%eax), %mm7 C
+ lea 4(%eax), %esi C init up, up++
+ lea 4(%eax), %eax C up2++ FIXME: should fix offsets
+ lea 4(%edx), %edi C init rp, rp++
+ lea 4(%edx), %edx C rp2++
+ lea -4(%ecx), %ebx C loop count
+ and $3, %ecx
+ jz L(3m)
+ cmp $2, %ecx
+ ja L(2m)
+ jb L(0m)
+
+L(1m):
+ movd (%eax), %mm4 C m 1
+ lea (%ebx), %ecx C inner loop count m 1
+ pmuludq %mm7, %mm4 C m 1
+ movd 4(%eax), %mm3 C m 1
+ pmuludq %mm7, %mm3 C m 1
+ movd 8(%eax), %mm0 C m 1
+ jmp L(m01) C m 1
+ ALIGN(16) C m 1
+L(lpm1):
+ pmuludq %mm7, %mm4 C m 1
+ paddq %mm0, %mm6 C m 1
+ movd 4(%eax), %mm3 C m 1
+ movd %mm6, -8(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ pmuludq %mm7, %mm3 C m 1
+ paddq %mm1, %mm6 C m 1
+ movd 8(%eax), %mm0 C m 1
+ movd %mm6, -4(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+L(m01): pmuludq %mm7, %mm0 C m 1
+ paddq %mm4, %mm6 C m 1
+ movd 12(%eax), %mm1 C m 1
+ movd %mm6, (%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ pmuludq %mm7, %mm1 C m 1
+ paddq %mm3, %mm6 C m 1
+ movd 16(%eax), %mm4 C m 1
+ movd %mm6, 4(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ lea 16(%eax), %eax C m 1
+ lea 16(%edx), %edx C m 1
+ sub $4, %ecx C m 1
+ ja L(lpm1) C m 1
+ pmuludq %mm7, %mm4 C m 1
+ paddq %mm0, %mm6 C m 1
+ movd %mm6, -8(%edx) C m 1
+ psrlq $32, %mm6 C m 1
+ paddq %mm1, %mm6 C m 1
+ jmp L(0)
+
+L(2m):
+ movd (%eax), %mm1 C m 2
+ lea (%ebx), %ecx C inner loop count m 2
+ pmuludq %mm7, %mm1 C m 2
+ movd 4(%eax), %mm4 C m 2
+ pmuludq %mm7, %mm4 C m 2
+ movd 8(%eax), %mm3 C m 2
+ jmp L(m10) C m 2
+ ALIGN(16) C m 2
+L(lpm2):
+ pmuludq %mm7, %mm4 C m 2
+ paddq %mm0, %mm6 C m 2
+ movd 8(%eax), %mm3 C m 2
+ movd %mm6, -4(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+L(m10): pmuludq %mm7, %mm3 C m 2
+ paddq %mm1, %mm6 C m 2
+ movd 12(%eax), %mm0 C m 2
+ movd %mm6, (%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ pmuludq %mm7, %mm0 C m 2
+ paddq %mm4, %mm6 C m 2
+ movd 16(%eax), %mm1 C m 2
+ movd %mm6, 4(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ pmuludq %mm7, %mm1 C m 2
+ paddq %mm3, %mm6 C m 2
+ movd 20(%eax), %mm4 C m 2
+ movd %mm6, 8(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ lea 16(%eax), %eax C m 2
+ lea 16(%edx), %edx C m 2
+ sub $4, %ecx C m 2
+ ja L(lpm2) C m 2
+ pmuludq %mm7, %mm4 C m 2
+ paddq %mm0, %mm6 C m 2
+ movd %mm6, -4(%edx) C m 2
+ psrlq $32, %mm6 C m 2
+ paddq %mm1, %mm6 C m 2
+ jmp L(1)
+
+L(3m):
+ movd (%eax), %mm0 C m 3
+ lea (%ebx), %ecx C inner loop count m 3
+ pmuludq %mm7, %mm0 C m 3
+ movd 4(%eax), %mm1 C m 3
+ pmuludq %mm7, %mm1 C m 3
+ movd 8(%eax), %mm4 C m 3
+ jmp L(lpm3) C m 3
+ ALIGN(16) C m 3
+L(lpm3):
+ pmuludq %mm7, %mm4 C m 3
+ paddq %mm0, %mm6 C m 3
+ movd 12(%eax), %mm3 C m 3
+ movd %mm6, (%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ pmuludq %mm7, %mm3 C m 3
+ paddq %mm1, %mm6 C m 3
+ movd 16(%eax), %mm0 C m 3
+ movd %mm6, 4(%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ pmuludq %mm7, %mm0 C m 3
+ paddq %mm4, %mm6 C m 3
+ movd 20(%eax), %mm1 C m 3
+ movd %mm6, 8(%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ pmuludq %mm7, %mm1 C m 3
+ paddq %mm3, %mm6 C m 3
+ movd 24(%eax), %mm4 C m 3
+ movd %mm6, 12(%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ lea 16(%eax), %eax C m 3
+ lea 16(%edx), %edx C m 3
+ sub $4, %ecx C m 3
+ ja L(lpm3) C m 3
+ pmuludq %mm7, %mm4 C m 3
+ paddq %mm0, %mm6 C m 3
+ movd %mm6, (%edx) C m 3
+ psrlq $32, %mm6 C m 3
+ paddq %mm1, %mm6 C m 3
+ jmp L(2)
+
+L(0m):
+ movd (%eax), %mm3 C m 0
+ lea (%ebx), %ecx C inner loop count m 0
+ pmuludq %mm7, %mm3 C m 0
+ movd 4(%eax), %mm0 C m 0
+ pmuludq %mm7, %mm0 C m 0
+ movd 8(%eax), %mm1 C m 0
+ jmp L(m00) C m 0
+ ALIGN(16) C m 0
+L(lpm0):
+ pmuludq %mm7, %mm4 C m 0
+ paddq %mm0, %mm6 C m 0
+ movd (%eax), %mm3 C m 0
+ movd %mm6, -12(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ pmuludq %mm7, %mm3 C m 0
+ paddq %mm1, %mm6 C m 0
+ movd 4(%eax), %mm0 C m 0
+ movd %mm6, -8(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ pmuludq %mm7, %mm0 C m 0
+ paddq %mm4, %mm6 C m 0
+ movd 8(%eax), %mm1 C m 0
+ movd %mm6, -4(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+L(m00): pmuludq %mm7, %mm1 C m 0
+ paddq %mm3, %mm6 C m 0
+ movd 12(%eax), %mm4 C m 0
+ movd %mm6, (%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ lea 16(%eax), %eax C m 0
+ lea 16(%edx), %edx C m 0
+ sub $4, %ecx C m 0
+ ja L(lpm0) C m 0
+ pmuludq %mm7, %mm4 C m 0
+ paddq %mm0, %mm6 C m 0
+ movd %mm6, -12(%edx) C m 0
+ psrlq $32, %mm6 C m 0
+ paddq %mm1, %mm6 C m 0
+ jmp L(3)
+
+L(outer):
+ lea 8(%edi), %edi C rp += 2
+ movd (%esi), %mm7 C am 3
+ mov %edi, %edx C rp2 = rp am 3
+ lea 4(%esi), %esi C up++ am 3
+ lea (%esi), %eax C up2 = up am 3
+ movd (%eax), %mm0 C am 3
+ lea (%ebx), %ecx C inner loop count am 3
+ pxor %mm6, %mm6 C am 3
+ pmuludq %mm7, %mm0 C am 3
+ movd 4(%eax), %mm1 C am 3
+ movd (%edx), %mm4 C am 3
+ pmuludq %mm7, %mm1 C am 3
+ movd 8(%eax), %mm2 C am 3
+ paddq %mm0, %mm4 C am 3
+ movd 4(%edx), %mm5 C am 3
+ jmp L(lam3) C am 3
+ ALIGN(16) C am 3
+L(lam3):
+ pmuludq %mm7, %mm2 C am 3
+ paddq %mm4, %mm6 C am 3
+ movd 12(%eax), %mm3 C am 3
+ paddq %mm1, %mm5 C am 3
+ movd 8(%edx), %mm4 C am 3
+ movd %mm6, (%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ pmuludq %mm7, %mm3 C am 3
+ paddq %mm5, %mm6 C am 3
+ movd 16(%eax), %mm0 C am 3
+ paddq %mm2, %mm4 C am 3
+ movd 12(%edx), %mm5 C am 3
+ movd %mm6, 4(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ pmuludq %mm7, %mm0 C am 3
+ paddq %mm4, %mm6 C am 3
+ movd 20(%eax), %mm1 C am 3
+ paddq %mm3, %mm5 C am 3
+ movd 16(%edx), %mm4 C am 3
+ movd %mm6, 8(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ pmuludq %mm7, %mm1 C am 3
+ paddq %mm5, %mm6 C am 3
+ movd 24(%eax), %mm2 C am 3
+ paddq %mm0, %mm4 C am 3
+ movd 20(%edx), %mm5 C am 3
+ movd %mm6, 12(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ lea 16(%eax), %eax C am 3
+ lea 16(%edx), %edx C am 3
+ sub $4, %ecx C am 3
+ ja L(lam3) C am 3
+ pmuludq %mm7, %mm2 C am 3
+ paddq %mm4, %mm6 C am 3
+ paddq %mm1, %mm5 C am 3
+ movd 8(%edx), %mm4 C am 3
+ movd %mm6, (%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ paddq %mm5, %mm6 C am 3
+ paddq %mm2, %mm4 C am 3
+L(2): movd %mm6, 4(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ paddq %mm4, %mm6 C am 3
+ movd %mm6, 8(%edx) C am 3
+ psrlq $32, %mm6 C am 3
+ movd %mm6, 12(%edx) C am 3
+
+ lea 8(%edi), %edi C rp += 2
+ movd (%esi), %mm7 C am 2
+ mov %edi, %edx C rp2 = rp am 2
+ lea 4(%esi), %esi C up++ am 2
+ lea (%esi), %eax C up2 = up am 2
+ movd (%eax), %mm1 C am 2
+ lea (%ebx), %ecx C inner loop count am 2
+ pxor %mm6, %mm6 C am 2
+ pmuludq %mm7, %mm1 C am 2
+ movd 4(%eax), %mm2 C am 2
+ movd (%edx), %mm5 C am 2
+ pmuludq %mm7, %mm2 C am 2
+ movd 8(%eax), %mm3 C am 2
+ paddq %mm1, %mm5 C am 2
+ movd 4(%edx), %mm4 C am 2
+ jmp L(am10) C am 2
+ ALIGN(16) C am 2
+L(lam2):
+ pmuludq %mm7, %mm2 C am 2
+ paddq %mm4, %mm6 C am 2
+ movd 8(%eax), %mm3 C am 2
+ paddq %mm1, %mm5 C am 2
+ movd 4(%edx), %mm4 C am 2
+ movd %mm6, -4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+L(am10):
+ pmuludq %mm7, %mm3 C am 2
+ paddq %mm5, %mm6 C am 2
+ movd 12(%eax), %mm0 C am 2
+ paddq %mm2, %mm4 C am 2
+ movd 8(%edx), %mm5 C am 2
+ movd %mm6, (%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ pmuludq %mm7, %mm0 C am 2
+ paddq %mm4, %mm6 C am 2
+ movd 16(%eax), %mm1 C am 2
+ paddq %mm3, %mm5 C am 2
+ movd 12(%edx), %mm4 C am 2
+ movd %mm6, 4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ pmuludq %mm7, %mm1 C am 2
+ paddq %mm5, %mm6 C am 2
+ movd 20(%eax), %mm2 C am 2
+ paddq %mm0, %mm4 C am 2
+ movd 16(%edx), %mm5 C am 2
+ movd %mm6, 8(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ lea 16(%eax), %eax C am 2
+ lea 16(%edx), %edx C am 2
+ sub $4, %ecx C am 2
+ ja L(lam2) C am 2
+ pmuludq %mm7, %mm2 C am 2
+ paddq %mm4, %mm6 C am 2
+ paddq %mm1, %mm5 C am 2
+ movd 4(%edx), %mm4 C am 2
+ movd %mm6, -4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ paddq %mm5, %mm6 C am 2
+ paddq %mm2, %mm4 C am 2
+L(1): movd %mm6, (%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ paddq %mm4, %mm6 C am 2
+ movd %mm6, 4(%edx) C am 2
+ psrlq $32, %mm6 C am 2
+ movd %mm6, 8(%edx) C am 2
+
+ lea 8(%edi), %edi C rp += 2
+ movd (%esi), %mm7 C am 1
+ mov %edi, %edx C rp2 = rp am 1
+ lea 4(%esi), %esi C up++ am 1
+ lea (%esi), %eax C up2 = up am 1
+ movd (%eax), %mm2 C am 1
+ lea (%ebx), %ecx C inner loop count am 1
+ pxor %mm6, %mm6 C am 1
+ pmuludq %mm7, %mm2 C am 1
+ movd 4(%eax), %mm3 C am 1
+ movd (%edx), %mm4 C am 1
+ pmuludq %mm7, %mm3 C am 1
+ movd 8(%eax), %mm0 C am 1
+ paddq %mm2, %mm4 C am 1
+ movd 4(%edx), %mm5 C am 1
+ jmp L(am01) C am 1
+ ALIGN(16) C am 1
+L(lam1):
+ pmuludq %mm7, %mm2 C am 1
+ paddq %mm4, %mm6 C am 1
+ movd 4(%eax), %mm3 C am 1
+ paddq %mm1, %mm5 C am 1
+ movd (%edx), %mm4 C am 1
+ movd %mm6, -8(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ pmuludq %mm7, %mm3 C am 1
+ paddq %mm5, %mm6 C am 1
+ movd 8(%eax), %mm0 C am 1
+ paddq %mm2, %mm4 C am 1
+ movd 4(%edx), %mm5 C am 1
+ movd %mm6, -4(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+L(am01):
+ pmuludq %mm7, %mm0 C am 1
+ paddq %mm4, %mm6 C am 1
+ movd 12(%eax), %mm1 C am 1
+ paddq %mm3, %mm5 C am 1
+ movd 8(%edx), %mm4 C am 1
+ movd %mm6, (%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ pmuludq %mm7, %mm1 C am 1
+ paddq %mm5, %mm6 C am 1
+ movd 16(%eax), %mm2 C am 1
+ paddq %mm0, %mm4 C am 1
+ movd 12(%edx), %mm5 C am 1
+ movd %mm6, 4(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ lea 16(%eax), %eax C am 1
+ lea 16(%edx), %edx C am 1
+ sub $4, %ecx C am 1
+ ja L(lam1) C am 1
+ pmuludq %mm7, %mm2 C am 1
+ paddq %mm4, %mm6 C am 1
+ paddq %mm1, %mm5 C am 1
+ movd (%edx), %mm4 C am 1
+ movd %mm6, -8(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ paddq %mm5, %mm6 C am 1
+ paddq %mm2, %mm4 C am 1
+L(0): movd %mm6, -4(%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ paddq %mm4, %mm6 C am 1
+ movd %mm6, (%edx) C am 1
+ psrlq $32, %mm6 C am 1
+ movd %mm6, 4(%edx) C am 1
+
+ lea 8(%edi), %edi C rp += 2
+ movd (%esi), %mm7 C am 0
+ mov %edi, %edx C rp2 = rp am 0
+ lea 4(%esi), %esi C up++ am 0
+ lea (%esi), %eax C up2 = up am 0
+ movd (%eax), %mm3 C am 0
+ lea (%ebx), %ecx C inner loop count am 0
+ pxor %mm6, %mm6 C am 0
+ pmuludq %mm7, %mm3 C am 0
+ movd 4(%eax), %mm0 C am 0
+ movd (%edx), %mm5 C am 0
+ pmuludq %mm7, %mm0 C am 0
+ movd 8(%eax), %mm1 C am 0
+ paddq %mm3, %mm5 C am 0
+ movd 4(%edx), %mm4 C am 0
+ jmp L(am00) C am 0
+ ALIGN(16) C am 0
+L(lam0):
+ pmuludq %mm7, %mm2 C am 0
+ paddq %mm4, %mm6 C am 0
+ movd (%eax), %mm3 C am 0
+ paddq %mm1, %mm5 C am 0
+ movd -4(%edx), %mm4 C am 0
+ movd %mm6, -12(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ pmuludq %mm7, %mm3 C am 0
+ paddq %mm5, %mm6 C am 0
+ movd 4(%eax), %mm0 C am 0
+ paddq %mm2, %mm4 C am 0
+ movd (%edx), %mm5 C am 0
+ movd %mm6, -8(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ pmuludq %mm7, %mm0 C am 0
+ paddq %mm4, %mm6 C am 0
+ movd 8(%eax), %mm1 C am 0
+ paddq %mm3, %mm5 C am 0
+ movd 4(%edx), %mm4 C am 0
+ movd %mm6, -4(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+L(am00):
+ pmuludq %mm7, %mm1 C am 0
+ paddq %mm5, %mm6 C am 0
+ movd 12(%eax), %mm2 C am 0
+ paddq %mm0, %mm4 C am 0
+ movd 8(%edx), %mm5 C am 0
+ movd %mm6, (%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ lea 16(%eax), %eax C am 0
+ lea 16(%edx), %edx C am 0
+ sub $4, %ecx C am 0
+ ja L(lam0) C am 0
+ pmuludq %mm7, %mm2 C am 0
+ paddq %mm4, %mm6 C am 0
+ paddq %mm1, %mm5 C am 0
+ movd -4(%edx), %mm4 C am 0
+ movd %mm6, -12(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ paddq %mm5, %mm6 C am 0
+ paddq %mm2, %mm4 C am 0
+L(3): movd %mm6, -8(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ paddq %mm4, %mm6 C am 0
+ movd %mm6, -4(%edx) C am 0
+ psrlq $32, %mm6 C am 0
+ movd %mm6, (%edx) C am 0
+ sub $4, %ebx C am 0
+ ja L(outer) C am 0
+
+ mov %edi, %edx
+ mov %esi, %eax
+ pop %edi
+ pop %ebx
+ pop %esi
+
+L(am3): C up[un-1..un-3] x up[un-4]
+ lea 8(%edx), %edx C rp2 += 2
+ movd (%eax), %mm7
+ movd 4(%eax), %mm1
+ movd 8(%eax), %mm2
+ movd 12(%eax), %mm3
+ movd (%edx), %mm4
+ pmuludq %mm7, %mm1
+ movd 4(%edx), %mm5
+ pmuludq %mm7, %mm2
+ movd 8(%edx), %mm6
+ pmuludq %mm7, %mm3
+ paddq %mm1, %mm4
+ paddq %mm2, %mm5
+ paddq %mm3, %mm6
+ movd %mm4, (%edx)
+ psrlq $32, %mm4
+ paddq %mm5, %mm4
+ movd %mm4, 4(%edx)
+ psrlq $32, %mm4
+ paddq %mm6, %mm4
+ movd %mm4, 8(%edx)
+ psrlq $32, %mm4
+ movd %mm4, 12(%edx) C FIXME feed through!
+ lea 4(%eax), %eax
+
+L(am2): C up[un-1..un-2] x up[un-3]
+ lea 8(%edx), %edx C rp2 += 2
+ movd (%eax), %mm7
+ movd 4(%eax), %mm1
+ movd 8(%eax), %mm2
+ movd (%edx), %mm4
+ movd 4(%edx), %mm5
+ pmuludq %mm7, %mm1
+ pmuludq %mm7, %mm2
+ paddq %mm1, %mm4
+ paddq %mm2, %mm5
+ movd %mm4, (%edx)
+ psrlq $32, %mm4
+ paddq %mm5, %mm4
+ movd %mm4, 4(%edx)
+ psrlq $32, %mm4
+ movd %mm4, 8(%edx) C FIXME feed through!
+ lea 4(%eax), %eax
+
+L(am1): C up[un-1] x up[un-2]
+ lea 8(%edx), %edx C rp2 += 2
+ movd (%eax), %mm7
+ movd 4(%eax), %mm2
+ movd (%edx), %mm4
+ pmuludq %mm7, %mm2
+ paddq %mm2, %mm4
+ movd %mm4, (%edx)
+ psrlq $32, %mm4
+ movd %mm4, 4(%edx)
+
+C *** diag stuff, use elementary code for now
+
+ mov 4(%esp), %edx C rp
+ mov 8(%esp), %eax C up
+ mov 12(%esp), %ecx C un
+
+ movd (%eax), %mm2
+ pmuludq %mm2, %mm2 C src[0]^2
+
+ pcmpeqd %mm7, %mm7
+ psrlq $32, %mm7
+
+ movd 4(%edx), %mm3 C dst[1]
+
+ movd %mm2, (%edx)
+ psrlq $32, %mm2
+
+ psllq $1, %mm3 C 2*dst[1]
+ paddq %mm3, %mm2
+ movd %mm2, 4(%edx)
+ psrlq $32, %mm2
+
+ sub $2, %ecx
+
+L(diag):
+ movd 4(%eax), %mm0 C src limb
+ add $4, %eax
+ pmuludq %mm0, %mm0
+ movq %mm7, %mm1
+ pand %mm0, %mm1 C diagonal low
+ psrlq $32, %mm0 C diagonal high
+
+ movd 8(%edx), %mm3
+ psllq $1, %mm3 C 2*dst[i]
+ paddq %mm3, %mm1
+ paddq %mm1, %mm2
+ movd %mm2, 8(%edx)
+ psrlq $32, %mm2
+
+ movd 12(%edx), %mm3
+ psllq $1, %mm3 C 2*dst[i+1]
+ paddq %mm3, %mm0
+ paddq %mm0, %mm2
+ movd %mm2, 12(%edx)
+ add $8, %edx
+ psrlq $32, %mm2
+
+ sub $1, %ecx
+ jnz L(diag)
+
+ movd 4(%eax), %mm0 C src[size-1]
+ pmuludq %mm0, %mm0
+ pand %mm0, %mm7 C diagonal low
+ psrlq $32, %mm0 C diagonal high
+
+ movd 8(%edx), %mm3 C dst[2*size-2]
+ psllq $1, %mm3
+ paddq %mm3, %mm7
+ paddq %mm7, %mm2
+ movd %mm2, 8(%edx)
+ psrlq $32, %mm2
+
+ paddq %mm0, %mm2
+ movd %mm2, 12(%edx) C dst[2*size-1]
+
+ emms
+ ret
+
+EPILOGUE()
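
The sqr_basecase code above is easier to follow against a plain C model of the same two-phase scheme: the mul_1/addmul_1 cascade (the "am 3"/"am 2"/"am 1"/"am 0" blocks) accumulates the cross products up[i]*up[j] for i < j into the result area, and the closing "diag" loop then doubles those limbs while folding in the squares up[i]^2 under a 64-bit carry accumulator, which is what the psllq $1 / paddq / psrlq $32 sequence does with MMX. The sketch below is only an illustration of that algorithm with 32-bit limbs; the name ref_sqr_basecase is made up, and the naive loop structure says nothing about how the assembly is actually scheduled or unrolled.

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative model of basecase squaring: rp[0..2n-1] = up[0..n-1]^2.
       Phase 1 builds the sum of cross products, phase 2 doubles it and adds
       the diagonal squares with a 64-bit carry accumulator. */
    static void
    ref_sqr_basecase (uint32_t *rp, const uint32_t *up, size_t n)
    {
      size_t i, j, k;
      uint64_t acc = 0;

      /* Phase 1: rp[1..2n-2] = sum of up[i]*up[j] for i < j. */
      for (k = 0; k < 2 * n; k++)
        rp[k] = 0;
      for (i = 0; i < n; i++)
        {
          uint64_t cy = 0;
          for (j = i + 1; j < n; j++)
            {
              uint64_t t = (uint64_t) up[i] * up[j] + rp[i + j] + cy;
              rp[i + j] = (uint32_t) t;
              cy = t >> 32;
            }
          rp[i + n] = (uint32_t) cy;   /* carry lands in a fresh limb */
        }

      /* Phase 2 ("diag"): rp = 2*rp + up[i]^2 at positions 2i, 2i+1. */
      for (i = 0; i < n; i++)
        {
          uint64_t d = (uint64_t) up[i] * up[i];
          acc += 2 * (uint64_t) rp[2 * i] + (uint32_t) d;
          rp[2 * i] = (uint32_t) acc;
          acc >>= 32;
          acc += 2 * (uint64_t) rp[2 * i + 1] + (d >> 32);
          rp[2 * i + 1] = (uint32_t) acc;
          acc >>= 32;
        }
    }
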
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm
new file mode 100644
index 0000000..5ba1c01
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/sub_n.asm
@@ -0,0 +1,119 @@
+dnl Intel Pentium-4 mpn_sub_n -- mpn subtraction.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C dst!=src1,2 dst==src1 dst==src2
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) ?
+C P6 model 13 (Dothan) ?
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 4 6 6
+C P4 model 3-4 (Prescott) 4.25 7.5 7.5
+
+defframe(PARAM_CARRY,20)
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC2, 12)
+defframe(PARAM_SRC1, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_EBX,`PARAM_SRC1')
+
+ TEXT
+ ALIGN(8)
+
+PROLOGUE(mpn_sub_nc)
+deflit(`FRAME',0)
+ movd PARAM_CARRY, %mm0
+ jmp L(start_nc)
+EPILOGUE()
+
+ ALIGN(8)
+PROLOGUE(mpn_sub_n)
+deflit(`FRAME',0)
+ pxor %mm0, %mm0
+L(start_nc):
+ mov PARAM_SRC1, %eax
+ mov %ebx, SAVE_EBX
+ mov PARAM_SRC2, %ebx
+ mov PARAM_DST, %edx
+ mov PARAM_SIZE, %ecx
+
+ lea (%eax,%ecx,4), %eax C src1 end
+ lea (%ebx,%ecx,4), %ebx C src2 end
+ lea (%edx,%ecx,4), %edx C dst end
+ neg %ecx C -size
+
+L(top):
+ C eax src1 end
+ C ebx src2 end
+ C ecx counter, limbs, negative
+ C edx dst end
+ C mm0 carry bit
+
+ movd (%eax,%ecx,4), %mm1
+ movd (%ebx,%ecx,4), %mm2
+ psubq %mm2, %mm1
+
+ psubq %mm0, %mm1
+ movd %mm1, (%edx,%ecx,4)
+
+ psrlq $63, %mm1
+
+ add $1, %ecx
+ jz L(done_mm1)
+
+ movd (%eax,%ecx,4), %mm0
+ movd (%ebx,%ecx,4), %mm2
+ psubq %mm2, %mm0
+
+ psubq %mm1, %mm0
+ movd %mm0, (%edx,%ecx,4)
+
+ psrlq $63, %mm0
+
+ add $1, %ecx
+ jnz L(top)
+
+ movd %mm0, %eax
+ mov SAVE_EBX, %ebx
+ emms
+ ret
+
+L(done_mm1):
+ movd %mm1, %eax
+ mov SAVE_EBX, %ebx
+ emms
+ ret
+
+EPILOGUE()
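
The sub_n loop above performs each limb subtraction in a 64-bit MMX register: psubq leaves the difference of the two limbs minus the incoming borrow, the low 32 bits are stored as the result limb, and psrlq $63 extracts the sign bit as the next borrow. A portable C rendering of that idea, purely for illustration (ref_sub_n and its borrow_in parameter are made-up names, not GMP's API; the entry points above take the carry-in either from the stack or as an implicit zero):

    #include <stdint.h>
    #include <stddef.h>

    /* Each limb subtraction is done in 64 bits; the next borrow is the top
       bit of the 64-bit difference, exactly what psubq + psrlq $63 compute. */
    static uint32_t
    ref_sub_n (uint32_t *dp, const uint32_t *s1p, const uint32_t *s2p,
               size_t n, uint32_t borrow_in)
    {
      uint64_t borrow = borrow_in;            /* 0 or 1 */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t d = (uint64_t) s1p[i] - s2p[i] - borrow;
          dp[i] = (uint32_t) d;               /* low 32 bits: result limb */
          borrow = d >> 63;                   /* sign bit: borrow out */
        }
      return (uint32_t) borrow;               /* final borrow, as returned in %eax */
    }

The assembly unrolls this two-fold, alternating %mm0 and %mm1 as the borrow register, which avoids a register-to-register move on the dependent chain.
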
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm
new file mode 100644
index 0000000..020675b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm
@@ -0,0 +1,182 @@
+dnl Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) 6.8
+C P6 model 13 (Dothan) 6.9
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 5.87
+C P4 model 3-4 (Prescott) 6.5
+
+C This code represents a step forwards compared to the code available before
+C GMP 5.1, but it is not carefully tuned for either P6 or P4. In fact, it is
+C not good for P6. For P4 it saved a bit over 1 c/l for both Northwood and
+C Prescott compared to the old code.
+C
+C The arrangements made here to get a two-instruction dependent chain are
+C slightly subtle. In the loop the carry (or rather the borrow) is kept as a
+C negative value, so that a paddq can be used to give a low limb ready to
+C store, and a high limb ready to become the new carry after a psrlq.
+C
+C If the carry were a simple twos-complement negative then the psrlq shift would
+C need to bring in 0 bits or 1 bits according to whether the high was zero or
+C non-zero, since a non-zero value would represent a negative needing sign
+C extension. That wouldn't be particularly easy to arrange and certainly would
+C add an instruction to the dependent chain, so instead an offset is applied so
+C that the high limb will be 0xFFFFFFFF+c. With c in the range -0xFFFFFFFF to
+C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore
+C always positive and can always have 0 bits shifted in, which is what psrlq
+C does.
+C
+C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
+C done off the dependent chain. The total adjustment then is to add
+C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF
+C to remove the offset from the current carry, for a net add of
+C 0xFFFFFFFE00000001. In the code this is applied to the destination limb when
+C fetched.
+C
+C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
+C negative, which is how it's undone for the return value, but that doesn't
+C seem as clear.
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_submul_1c)
+deflit(`FRAME',0)
+ movd PARAM_CARRY, %mm1
+ jmp L(start_1c)
+EPILOGUE()
+
+PROLOGUE(mpn_submul_1)
+deflit(`FRAME',0)
+ pxor %mm1, %mm1 C initial borrow
+
+L(start_1c):
+ mov PARAM_SRC, %eax
+ pcmpeqd %mm0, %mm0
+
+ movd PARAM_MULTIPLIER, %mm7
+ pcmpeqd %mm6, %mm6
+
+ mov PARAM_DST, %edx
+ psrlq $32, %mm0 C 0x00000000FFFFFFFF
+
+ mov PARAM_SIZE, %ecx
+ psllq $32, %mm6 C 0xFFFFFFFF00000000
+
+ psubq %mm0, %mm6 C 0xFFFFFFFE00000001
+
+ psubq %mm1, %mm0 C 0xFFFFFFFF - borrow
+
+
+ movd (%eax), %mm3 C up
+ movd (%edx), %mm4 C rp
+
+ add $-1, %ecx
+ paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm3
+ jnz L(gt1)
+ psubq %mm3, %mm4 C prod
+ paddq %mm4, %mm0 C borrow
+ movd %mm0, (%edx) C result
+ jmp L(rt)
+
+L(gt1): movd 4(%eax), %mm1 C up
+ movd 4(%edx), %mm2 C rp
+
+ add $-1, %ecx
+ jz L(eev)
+
+ ALIGN(16)
+L(top): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm1
+ psubq %mm3, %mm4 C prod
+ movd 8(%eax), %mm3 C up
+ paddq %mm4, %mm0 C borrow
+ movd 8(%edx), %mm4 C rp
+ movd %mm0, (%edx) C result
+ psrlq $32, %mm0
+
+ add $-1, %ecx
+ jz L(eod)
+
+ paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm3
+ psubq %mm1, %mm2 C prod
+ movd 12(%eax), %mm1 C up
+ paddq %mm2, %mm0 C borrow
+ movd 12(%edx), %mm2 C rp
+ movd %mm0, 4(%edx) C result
+ psrlq $32, %mm0
+
+ lea 8(%eax), %eax
+ lea 8(%edx), %edx
+ add $-1, %ecx
+ jnz L(top)
+
+
+L(eev): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm1
+ psubq %mm3, %mm4 C prod
+ paddq %mm4, %mm0 C borrow
+ movd %mm0, (%edx) C result
+ psrlq $32, %mm0
+ psubq %mm1, %mm2 C prod
+ paddq %mm2, %mm0 C borrow
+ movd %mm0, 4(%edx) C result
+L(rt): psrlq $32, %mm0
+ movd %mm0, %eax
+ not %eax
+ emms
+ ret
+
+L(eod): paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm3
+ psubq %mm1, %mm2 C prod
+ paddq %mm2, %mm0 C borrow
+ movd %mm0, 4(%edx) C result
+ psrlq $32, %mm0
+ psubq %mm3, %mm4 C prod
+ paddq %mm4, %mm0 C borrow
+ movd %mm0, 8(%edx) C result
+ jmp L(rt)
+EPILOGUE()
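
To make the offset-borrow bookkeeping described in the submul_1 comments concrete, here is a plain C model of the per-limb step. The running value ocy always holds 0xFFFFFFFF + c, where c <= 0 is the true borrow, so it is never negative and the high half of the 64-bit accumulator can simply be shifted down, exactly as psrlq does; the constant 0xFFFFFFFE00000001 added to each rp limb removes the old offset and installs the new one in a single paddq, and the final one's complement mirrors the `not %eax` before return. This is an illustration only, assuming 32-bit limbs; ref_submul_1 is a made-up name, not GMP's API.

    #include <stdint.h>
    #include <stddef.h>

    /* rp[] -= up[] * v, returning the borrow, using the offset-borrow scheme. */
    static uint32_t
    ref_submul_1 (uint32_t *rp, const uint32_t *up, size_t n, uint32_t v)
    {
      uint64_t ocy = 0xFFFFFFFFu;                     /* 0xFFFFFFFF + c, c = 0 */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t t = rp[i] + 0xFFFFFFFE00000001ull  /* net offset adjustment */
                       - (uint64_t) up[i] * v
                       + ocy;
          rp[i] = (uint32_t) t;                       /* result limb */
          ocy = t >> 32;                              /* 0xFFFFFFFF + new borrow */
        }
      return ~(uint32_t) ocy;                         /* true borrow, like mpn_submul_1 */
    }
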