aboutsummaryrefslogtreecommitdiff
path: root/gmp-6.3.0/mpn/x86/k6/mmx
diff options
context:
space:
mode:
Diffstat (limited to 'gmp-6.3.0/mpn/x86/k6/mmx')
-rw-r--r--gmp-6.3.0/mpn/x86/k6/mmx/com.asm103
-rw-r--r--gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm282
-rw-r--r--gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm226
-rw-r--r--gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm130
-rw-r--r--gmp-6.3.0/mpn/x86/k6/mmx/popham.asm236
-rw-r--r--gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm130
6 files changed, 1107 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86/k6/mmx/com.asm b/gmp-6.3.0/mpn/x86/k6/mmx/com.asm
new file mode 100644
index 0000000..b747454
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/k6/mmx/com.asm
@@ -0,0 +1,103 @@
+dnl AMD K6-2 mpn_com -- mpn bitwise one's complement.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+NAILS_SUPPORT(0-31)
+
+
+C alignment dst/src, A=0mod8 N=4mod8
+C A/A A/N N/A N/N
+C K6-2 1.0 1.18 1.18 1.18 cycles/limb
+C K6 1.5 1.85 1.75 1.85
+
+
+C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Take the bitwise ones-complement of src,size and write it to dst,size.
+C Pairs of limbs are complemented with MMX; a leftover odd limb uses integer code.
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_com)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx C ecx = size in limbs
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+ shrl %ecx C ecx = floor(size/2), carry = low bit of size
+ jnz L(two_or_more) C at least one full pair of limbs
+
+ movl (%eax), %eax C no pairs: one limb (size presumably 1)
+ notl_or_xorl_GMP_NUMB_MASK( %eax)
+ movl %eax, (%edx)
+ ret
+
+
+L(two_or_more):
+ pushl %ebx FRAME_pushl()
+ pcmpeqd %mm7, %mm7 C all ones
+
+ movl %ecx, %ebx C keep floor(size/2) to index the odd limb later
+ifelse(GMP_NAIL_BITS,0,,
+` psrld $GMP_NAIL_BITS, %mm7') C clear nails
+
+
+
+ ALIGN(8)
+L(top):
+ C eax src
+ C ebx floor(size/2)
+ C ecx counter, qwords remaining, counts down to 0
+ C edx dst
+ C
+ C mm0 scratch
+ C mm7 mask, all ones apart from any nail bits
+
+ movq -8(%eax,%ecx,8), %mm0 C two limbs, highest pair first
+ pxor %mm7, %mm0 C complement the numb bits
+ movq %mm0, -8(%edx,%ecx,8)
+ loop L(top) C loop leaves flags alone, carry from shrl survives
+
+
+ jnc L(no_extra) C carry clear means size was even
+ movl (%eax,%ebx,8), %eax C odd limb at index 2*floor(size/2)
+ notl_or_xorl_GMP_NUMB_MASK( %eax)
+ movl %eax, (%edx,%ebx,8)
+L(no_extra):
+
+ popl %ebx
+ emms_or_femms
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm b/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm
new file mode 100644
index 0000000..1bbad3a
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/k6/mmx/dive_1.asm
@@ -0,0 +1,282 @@
+dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C divisor
+C odd even
+C K6: 10.0 12.0 cycles/limb
+C K6-2: 10.0 11.5
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+C A simple divl is used for size==1. This is about 10 cycles faster for an
+C odd divisor or 20 cycles for an even divisor.
+C
+C The loops are quite sensitive to code alignment, speeds should be
+C rechecked (odd and even divisor, pic and non-pic) if contemplating
+C changing anything.
+C Other sizes: strip twos from the divisor, invert the odd part mod 2^32, then multiply limb by limb.
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space: dst is copied to edi before this slot is overwritten with the inverse
+define(VAR_INVERSE,`PARAM_DST')
+
+ TEXT
+
+ ALIGN(32)
+PROLOGUE(mpn_divexact_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+
+ movl PARAM_SRC, %eax
+ xorl %edx, %edx C edx = 0: high half for divl, and start of the twos count
+
+ cmpl $1, %ecx
+ jnz L(two_or_more) C any size other than 1
+
+ movl (%eax), %eax C src[0]
+
+ divl PARAM_DIVISOR C eax = src[0] / divisor
+
+ movl PARAM_DST, %ecx
+ movl %eax, (%ecx)
+
+ ret
+
+
+L(two_or_more):
+ movl PARAM_DIVISOR, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ pushl %ebp FRAME_pushl()
+
+L(strip_twos):
+ shrl %eax C low bit of d goes to carry
+ incl %edx C will get shift+1
+
+ jnc L(strip_twos) C repeat until a 1 bit has been shifted out
+ pushl %esi FRAME_pushl()
+
+ leal 1(%eax,%eax), %esi C d without twos
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %ebp)
+Zdisp( movzbl, 0,(%eax,%ebp), %eax)
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+ pushl %edi FRAME_pushl()
+
+ leal (%eax,%eax), %ebp C 2*inv
+
+ imull %eax, %eax C inv*inv
+
+ movl PARAM_DST, %edi C fetch dst before VAR_INVERSE reuses its slot
+
+ imull %esi, %eax C inv*inv*d
+
+ subl %eax, %ebp C inv = 2*inv - inv*inv*d
+ leal (%ebp,%ebp), %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+
+ movl %esi, PARAM_DIVISOR C d without twos
+ leal (%ebx,%ecx,4), %ebx C src end
+
+ imull %esi, %ebp C inv*inv*d
+
+ leal (%edi,%ecx,4), %edi C dst end
+ negl %ecx C -size
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ subl $1, %edx C shift amount, and clear carry
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ movl %eax, VAR_INVERSE
+ jnz L(even) C zero flag from the subl (assumes ASSERT keeps flags): nonzero shift, even divisor
+
+ movl (%ebx,%ecx,4), %esi C src low limb
+ jmp L(odd_entry)
+
+
+ ALIGN(16)
+ nop C code alignment
+L(odd_top):
+ C eax scratch
+ C ebx src end
+ C ecx counter, limbs, negative
+ C edx inverse
+ C esi next limb, adjusted for carry
+ C edi dst end
+ C ebp carry bit, 0 or -1
+
+ imull %edx, %esi C quotient limb = adjusted src limb * inverse
+
+ movl PARAM_DIVISOR, %eax
+ movl %esi, -4(%edi,%ecx,4) C store quotient limb
+
+ mull %esi C carry limb in edx
+
+ subl %ebp, %edx C apply carry bit
+ movl (%ebx,%ecx,4), %esi C next src limb
+
+L(odd_entry):
+ subl %edx, %esi C apply carry limb
+ movl VAR_INVERSE, %edx
+
+ sbbl %ebp, %ebp C 0 or -1
+
+ incl %ecx
+ jnz L(odd_top) C until the counter reaches zero
+
+
+ imull %edx, %esi C last quotient limb
+
+ movl %esi, -4(%edi,%ecx,4) C ecx is 0 here, so this is the top dst limb
+
+ popl %edi
+ popl %esi
+
+ popl %ebp
+ popl %ebx
+
+ ret
+
+
+L(even):
+ C eax
+ C ebx src end
+ C ecx -size
+ C edx twos
+ C esi
+ C edi dst end
+ C ebp
+
+ xorl %ebp, %ebp C no carry bit yet
+Zdisp( movq, 0,(%ebx,%ecx,4), %mm0) C src[0,1]
+
+ movd %edx, %mm7 C mm7 = shift count
+ movl VAR_INVERSE, %edx
+
+ addl $2, %ecx C zero flag set when size is 2
+ psrlq %mm7, %mm0 C low dword = low limb of src shifted right by twos
+
+ movd %mm0, %esi
+ jz L(even_two) C if only two limbs
+
+
+C Out-of-order execution is good enough to hide the load/rshift/movd
+C latency. Having imul at the top of the loop gives 11.5 c/l instead of 12,
+C on K6-2. In fact there's only 11 of decode, but nothing running at 11 has
+C been found. Maybe the fact every second movq is unaligned costs the extra
+C 0.5.
+
+L(even_top):
+ C eax scratch
+ C ebx src end
+ C ecx counter, limbs, negative
+ C edx inverse
+ C esi next limb, adjusted for carry
+ C edi dst end
+ C ebp carry bit, 0 or -1
+ C
+ C mm0 scratch, source limbs
+ C mm7 twos
+
+ imull %edx, %esi C quotient limb = adjusted limb * inverse
+
+ movl %esi, -8(%edi,%ecx,4) C store quotient limb
+ movl PARAM_DIVISOR, %eax
+
+ mull %esi C carry limb in edx
+
+ movq -4(%ebx,%ecx,4), %mm0 C next two src limbs
+ psrlq %mm7, %mm0 C low dword = next limb of src shifted right by twos
+
+ movd %mm0, %esi
+ subl %ebp, %edx C apply carry bit
+
+ subl %edx, %esi C apply carry limb
+ movl VAR_INVERSE, %edx
+
+ sbbl %ebp, %ebp C 0 or -1
+
+ incl %ecx
+ jnz L(even_top)
+
+
+L(even_two):
+ movd -4(%ebx), %mm0 C src high limb
+ psrlq %mm7, %mm0 C movd zero extended, so zeros shift in at the top
+
+ imull %edx, %esi C second highest quotient limb
+
+ movl %esi, -8(%edi)
+ movl PARAM_DIVISOR, %eax
+
+ mull %esi C carry limb in edx
+
+ movd %mm0, %esi
+ subl %ebp, %edx C apply carry bit
+
+ movl VAR_INVERSE, %eax
+ subl %edx, %esi C apply carry limb
+
+ imull %eax, %esi C highest quotient limb
+
+ movl %esi, -4(%edi)
+
+ popl %edi
+ popl %esi
+
+ popl %ebp
+ popl %ebx
+
+ emms_or_femms
+
+ ret
+
+EPILOGUE()
+ASM_END()
diff --git a/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm b/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm
new file mode 100644
index 0000000..e17930b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/k6/mmx/logops_n.asm
@@ -0,0 +1,226 @@
+dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
+dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
+
+dnl Copyright 1999-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+NAILS_SUPPORT(0-31)
+
+
+C alignment dst/src1/src2, A=0mod8, N=4mod8
+C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
+C
+C K6-2 1.2 1.5 1.5 1.2 1.2 1.5 1.5 1.2 and,andn,ior,xor
+C K6-2 1.5 1.75 2.0 1.75 1.75 2.0 1.75 1.5 iorn,xnor
+C K6-2 1.75 2.0 2.0 2.0 2.0 2.0 2.0 1.75 nand,nior
+C
+C K6 1.5 1.68 1.75 1.2 1.75 1.75 1.68 1.5 and,andn,ior,xor
+C K6 2.0 2.0 2.25 2.25 2.25 2.25 2.0 2.0 iorn,xnor
+C K6 2.0 2.25 2.25 2.25 2.25 2.25 2.25 2.0 nand,nior
+
+
+dnl M4_p and M4_i are the MMX and integer instructions
+dnl M4_*_neg_dst means whether to negate the final result before writing
+dnl M4_*_neg_src2 means whether to negate the src2 values before using them
+dnl Usage: M4_choose_op(name, p, p_neg_dst, p_neg_src2, i, i_neg_dst, i_neg_src2); only acts if OPERATION_name is defined
+define(M4_choose_op,
+m4_assert_numargs(7)
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_operation', `$1')
+define(`M4_p', `$2')
+define(`M4_p_neg_dst', `$3')
+define(`M4_p_neg_src2',`$4')
+define(`M4_i', `$5')
+define(`M4_i_neg_dst', `$6')
+define(`M4_i_neg_src2',`$7')
+')')
+
+dnl xnor is done in "iorn" style because it's a touch faster than "nior"
+dnl style (the two are equivalent for xor).
+dnl
+dnl pandn can't be used with nails.
+
+M4_choose_op( and_n, pand,0,0, andl,0,0)
+ifelse(GMP_NAIL_BITS,0,
+`M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
+`M4_choose_op(andn_n, pand,0,1, andl,0,1)')
+M4_choose_op( nand_n, pand,1,0, andl,1,0)
+M4_choose_op( ior_n, por,0,0, orl,0,0)
+M4_choose_op( iorn_n, por,0,1, orl,0,1)
+M4_choose_op( nior_n, por,1,0, orl,1,0)
+M4_choose_op( xor_n, pxor,0,0, xorl,0,0)
+M4_choose_op( xnor_n, pxor,0,1, xorl,0,1)
+dnl M4_function is only defined if one of the M4_choose_op calls above matched an OPERATION symbol
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+
+C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
+C mp_size_t size);
+C
+C Do src1,size M4_operation src2,size, storing the result in dst,size.
+C
+C Unaligned movq loads and stores are a bit slower than aligned ones. The
+C test at the start of the routine checks the alignment of src1 and if
+C necessary processes one limb separately at the low end to make it aligned.
+C
+C The raw speeds without this alignment switch are as follows.
+C
+C alignment dst/src1/src2, A=0mod8, N=4mod8
+C A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
+C
+C K6 1.5 2.0 1.5 2.0 and,andn,ior,xor
+C K6 1.75 2.2 2.0 2.28 iorn,xnor
+C K6 2.0 2.25 2.35 2.28 nand,nior
+C
+C
+C Future:
+C
+C K6 can do one 64-bit load per cycle so each of these routines should be
+C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be
+C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
+C The others are 4 instructions per 2 limbs, and so can only approach 1.0
+C because there's nowhere to hide some loop control.
+
+defframe(PARAM_SIZE,16)
+defframe(PARAM_SRC2,12)
+defframe(PARAM_SRC1,8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(M4_function)
+ movl PARAM_SIZE, %ecx C ecx = size in limbs
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC1, %eax
+
+ movl PARAM_SRC2, %ebx
+ cmpl $1, %ecx
+
+ movl PARAM_DST, %edx C movl keeps the flags from cmpl
+ ja L(two_or_more) C unsigned size > 1
+
+
+ movl (%ebx), %ecx C single limb path: src2[0]
+ popl %ebx
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)')
+ M4_i (%eax), %ecx C combine with src1[0]
+ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ecx)')
+ movl %ecx, (%edx)
+
+ ret
+
+
+L(two_or_more):
+ C eax src1
+ C ebx src2
+ C ecx size
+ C edx dst
+ C esi
+ C edi
+ C ebp
+
+ pushl %esi FRAME_pushl()
+ testl $4, %eax C is src1 at 4 mod 8
+ jz L(alignment_ok)
+
+ movl (%ebx), %esi C peel the low limb with integer code
+ addl $4, %ebx
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %esi)')
+ M4_i (%eax), %esi
+ addl $4, %eax C src1 is now 0 mod 8 (assuming limbs are 4 byte aligned)
+ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %esi)')
+ movl %esi, (%edx)
+ addl $4, %edx
+ decl %ecx C one limb done
+
+L(alignment_ok):
+ movl %ecx, %esi C esi = remaining size
+ shrl %ecx C ecx = qwords, carry = low bit of remaining size
+ jnz L(still_two_or_more)
+
+ movl (%ebx), %ecx C only one limb is left after the peel
+ popl %esi
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)')
+ M4_i (%eax), %ecx
+ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ecx)')
+ popl %ebx
+ movl %ecx, (%edx)
+ ret
+
+
+L(still_two_or_more):
+ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
+ pcmpeqd %mm7, %mm7 C all ones
+ifelse(GMP_NAIL_BITS,0,,`psrld $GMP_NAIL_BITS, %mm7') C clear nails
+')
+
+ ALIGN(16)
+L(top):
+ C eax src1
+ C ebx src2
+ C ecx counter, qwords remaining, counts down to 0
+ C edx dst
+ C esi remaining size in limbs, used to find the odd limb
+ C edi
+ C ebp
+ C
+ C carry bit is low of size
+
+ movq -8(%ebx,%ecx,8), %mm0 C two limbs of src2, highest pair first
+ifelse(M4_p_neg_src2,1,`pxor %mm7, %mm0')
+ M4_p -8(%eax,%ecx,8), %mm0 C combine with two limbs of src1
+ifelse(M4_p_neg_dst,1,` pxor %mm7, %mm0')
+ movq %mm0, -8(%edx,%ecx,8)
+
+ loop L(top) C loop leaves flags alone, carry from shrl survives
+
+
+ jnc L(no_extra) C carry clear means no odd limb remains
+
+ movl -4(%ebx,%esi,4), %ebx C odd limb is at index esi-1; src2 pointer no longer needed
+ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ebx)')
+ M4_i -4(%eax,%esi,4), %ebx
+ifelse(M4_i_neg_dst,1,` notl_or_xorl_GMP_NUMB_MASK( %ebx)')
+ movl %ebx, -4(%edx,%esi,4)
+L(no_extra):
+
+ popl %esi
+ popl %ebx
+ emms_or_femms
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm b/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm
new file mode 100644
index 0000000..45be582
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/k6/mmx/lshift.asm
@@ -0,0 +1,130 @@
+dnl AMD K6 mpn_lshift -- mpn left shift.
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 3.0 cycles/limb
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions. This is despite every second fetch being unaligned.
+C Limbs are produced from the high end down; the return value is the bits shifted out of the top limb.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax C eax = size-1, zero flag set when size is 1
+
+ movl PARAM_SHIFT, %ecx C movl keeps the flags from decl
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shldl( %cl, %edx, %eax) C return value
+
+ shll %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+ ALIGN(16) C avoid offset 0x1f
+ nop C avoid bad cache line crossing
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx,%eax,4), %edx C src high limb
+ negl %ecx
+
+ movd PARAM_SHIFT, %mm6
+ addl $32, %ecx C 32-shift
+
+ shrl %cl, %edx C retval = high limb >> (32-shift)
+
+ movd %ecx, %mm7
+ movl PARAM_DST, %ecx
+
+L(top):
+ C eax counter, size-1 to 1
+ C ebx src
+ C ecx dst
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%eax,4), %mm0 C src[i-1] and src[i], with i = eax
+ decl %eax C sets the zero flag tested by jnz below
+
+ psrlq %mm7, %mm0 C low dword = (src[i] << shift) | (src[i-1] >> (32-shift))
+
+ movd %mm0, 4(%ecx,%eax,4) C store dst[i]
+ jnz L(top)
+
+
+ movd (%ebx), %mm0 C src[0], zero extended
+ popl %ebx
+
+ psllq %mm6, %mm0
+ movl %edx, %eax C return value
+
+ movd %mm0, (%ecx) C dst[0] = low 32 bits of src[0] << shift
+
+ emms
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm b/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm
new file mode 100644
index 0000000..2b19d0b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/k6/mmx/popham.asm
@@ -0,0 +1,236 @@
+dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
+dnl hamming distance.
+
+dnl Copyright 2000-2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C popcount hamdist
+C K6-2: 9.0 11.5 cycles/limb
+C K6: 12.5 13.0
+
+
+C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
+C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
+C
+C The code here isn't optimal, but it's already a 2x speedup over the plain
+C integer mpn/generic/popcount.c,hamdist.c.
+
+dnl Stop with an error unless OPERATION_popcount or OPERATION_hamdist is defined.
+ifdef(`OPERATION_popcount',,
+`ifdef(`OPERATION_hamdist',,
+`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
+')m4exit(1)')')
+dnl HAM(text) expands to text only when OPERATION_hamdist is defined.
+define(HAM,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_hamdist',`$1')')
+dnl POP(text) expands to text only when OPERATION_popcount is defined.
+define(POP,
+m4_assert_numargs(1)
+`ifdef(`OPERATION_popcount',`$1')')
+dnl Stack parameter offsets and the function name for whichever operation is being built.
+HAM(`
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_hamdist)
+')
+POP(`
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+define(M4_function,mpn_popcount)
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+
+ifdef(`PIC',,`
+ dnl non-PIC
+
+ RODATA
+ ALIGN(8)
+
+L(rodata_AAAAAAAAAAAAAAAA):
+ .long 0xAAAAAAAA
+ .long 0xAAAAAAAA
+
+L(rodata_3333333333333333):
+ .long 0x33333333
+ .long 0x33333333
+
+L(rodata_0F0F0F0F0F0F0F0F):
+ .long 0x0F0F0F0F
+ .long 0x0F0F0F0F
+
+L(rodata_000000FF000000FF):
+ .long 0x000000FF
+ .long 0x000000FF
+')
+
+ TEXT
+ ALIGN(32)
+
+POP(`ifdef(`PIC', `
+ C avoid shrl crossing a 32-byte boundary
+ nop')')
+
+PROLOGUE(M4_function)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx C ecx = size in limbs
+
+ifdef(`PIC',`
+ movl $0xAAAAAAAA, %eax
+ movl $0x33333333, %edx
+
+ movd %eax, %mm7
+ movd %edx, %mm6
+
+ movl $0x0F0F0F0F, %eax
+ movl $0x000000FF, %edx
+
+ punpckldq %mm7, %mm7
+ punpckldq %mm6, %mm6
+
+ movd %eax, %mm5
+ movd %edx, %mm4
+
+ punpckldq %mm5, %mm5
+ punpckldq %mm4, %mm4
+',`
+
+ movq L(rodata_AAAAAAAAAAAAAAAA), %mm7
+ movq L(rodata_3333333333333333), %mm6
+ movq L(rodata_0F0F0F0F0F0F0F0F), %mm5
+ movq L(rodata_000000FF000000FF), %mm4
+')
+
+define(REG_AAAAAAAAAAAAAAAA, %mm7)
+define(REG_3333333333333333, %mm6)
+define(REG_0F0F0F0F0F0F0F0F, %mm5)
+define(REG_000000FF000000FF, %mm4)
+
+
+ movl PARAM_SRC, %eax
+HAM(` movl PARAM_SRC2, %edx')
+
+ pxor %mm2, %mm2 C total
+
+ shrl %ecx C ecx = qwords, carry = low bit of size
+ jnc L(top) C even size goes straight into the loop
+
+Zdisp( movd, 0,(%eax,%ecx,8), %mm1) C odd limb is the highest one, zero extended to a qword
+
+HAM(`
+Zdisp( movd, 0,(%edx,%ecx,8), %mm0)
+ pxor %mm0, %mm1
+')
+
+ incl %ecx C the odd limb takes one extra pass through the loop body
+ jmp L(loaded)
+
+
+ ALIGN(16)
+POP(` nop C alignment to avoid crossing 32-byte boundaries')
+
+L(top):
+ C eax src
+ C ebx
+ C ecx counter, qwords, decrementing
+ C edx [hamdist] src2
+ C
+ C mm0 (scratch)
+ C mm1 (scratch)
+ C mm2 total (low dword)
+ C mm3
+ C mm4 \
+ C mm5 | special constants
+ C mm6 |
+ C mm7 /
+
+ movq -8(%eax,%ecx,8), %mm1 C two limbs, highest pair first
+HAM(` pxor -8(%edx,%ecx,8), %mm1')
+
+L(loaded):
+ movq %mm1, %mm0
+ pand REG_AAAAAAAAAAAAAAAA, %mm1 C odd numbered bits
+
+ psrlq $1, %mm1
+HAM(` nop C code alignment')
+
+ psubd %mm1, %mm0 C bit pairs
+HAM(` nop C code alignment')
+
+
+ movq %mm0, %mm1
+ psrlq $2, %mm0
+
+ pand REG_3333333333333333, %mm0 C upper pair of each nibble
+ pand REG_3333333333333333, %mm1 C lower pair of each nibble
+
+ paddd %mm1, %mm0 C nibbles
+
+
+ movq %mm0, %mm1
+ psrlq $4, %mm0
+
+ pand REG_0F0F0F0F0F0F0F0F, %mm0 C upper nibble of each byte
+ pand REG_0F0F0F0F0F0F0F0F, %mm1 C lower nibble of each byte
+
+ paddd %mm1, %mm0 C bytes
+
+ movq %mm0, %mm1
+ psrlq $8, %mm0
+
+
+ paddb %mm1, %mm0 C words
+
+
+ movq %mm0, %mm1
+ psrlq $16, %mm0
+
+ paddd %mm1, %mm0 C dwords
+
+ pand REG_000000FF000000FF, %mm0 C keep the low byte of each dword
+
+ paddd %mm0, %mm2 C low to total
+ psrlq $32, %mm0
+
+ paddd %mm0, %mm2 C high to total
+ loop L(top)
+
+
+
+ movd %mm2, %eax C return the low dword of the total
+ emms_or_femms
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm b/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm
new file mode 100644
index 0000000..cd0382f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/k6/mmx/rshift.asm
@@ -0,0 +1,130 @@
+dnl AMD K6 mpn_rshift -- mpn right shift.
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 3.0 cycles/limb
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+C The loop runs at 3 cycles/limb, limited by decoding and by having 3 mmx
+C instructions. This is despite every second fetch being unaligned.
+C Limbs are produced from the low end up; the return value is the bits shifted out of the low limb, placed at the top of the word.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax C eax = size-1, zero flag set when size is 1
+
+ movl PARAM_SHIFT, %ecx C movl keeps the flags from decl
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shrdl( %cl, %edx, %eax) C return value
+
+ shrl %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx), %edx C src low limb
+ negl %ecx
+
+ addl $32, %ecx C 32-shift
+ movd PARAM_SHIFT, %mm6
+
+ shll %cl, %edx C retval
+ movl PARAM_DST, %ecx
+
+ leal (%ebx,%eax,4), %ebx C ebx = &src[size-1]
+
+ leal -4(%ecx,%eax,4), %ecx C ecx = &dst[size-2]
+ negl %eax C eax = -(size-1)
+
+
+L(simple):
+ C eax counter (negative)
+ C ebx &src[size-1]
+ C ecx &dst[size-2]
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+
+Zdisp( movq, 0,(%ebx,%eax,4), %mm0) C src[i] and src[i+1], with i = size-1+eax
+ incl %eax C sets the zero flag tested by jnz below
+
+ psrlq %mm6, %mm0 C low dword = (src[i] >> shift) | (src[i+1] << (32-shift))
+
+Zdisp( movd, %mm0, 0,(%ecx,%eax,4)) C store dst[i]
+ jnz L(simple)
+
+
+ movq %mm0, (%ecx) C last shifted pair supplies both dst[size-2] and dst[size-1]
+ movl %edx, %eax C return value
+
+ popl %ebx
+
+ emms
+ ret
+
+EPILOGUE()