aboutsummaryrefslogtreecommitdiff
path: root/gmp-6.3.0/mpn/x86/k6/k62mmx
diff options
context:
space:
mode:
Diffstat (limited to 'gmp-6.3.0/mpn/x86/k6/k62mmx')
-rw-r--r--gmp-6.3.0/mpn/x86/k6/k62mmx/copyd.asm118
-rw-r--r--gmp-6.3.0/mpn/x86/k6/k62mmx/lshift.asm294
-rw-r--r--gmp-6.3.0/mpn/x86/k6/k62mmx/rshift.asm293
3 files changed, 705 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86/k6/k62mmx/copyd.asm b/gmp-6.3.0/mpn/x86/k6/k62mmx/copyd.asm
new file mode 100644
index 0000000..f80a5a1
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/k6/k62mmx/copyd.asm
@@ -0,0 +1,118 @@
+dnl AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
+
+dnl Copyright 2001, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6-2: 1.0 cycles/limb
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
+C cycle startup time, which amounts for instance to a 2x speedup at 15
+C limbs.
+C
+C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
+C processing one limb separately to make it aligned. This and a final odd
+C limb are handled in a branch-free fashion, ending up re-copying if the
+C special case isn't needed.
+C
+C Alternatives:
+C
+C There used to be a big unrolled version of this, running at 0.56 c/l if
+C the destination was aligned, but that seemed rather excessive for the
+C relative importance of copyd.
+C
+C If the destination alignment is ignored and just left to run at 1.17 c/l
+C some code size and a fixed few cycles can be saved. Considering how few
+C uses copyd finds perhaps that should be favoured. The current code has
+C the attraction of being no slower than a basic rep movsl though.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-using parameter space
+define(SAVE_EBX,`PARAM_SIZE')
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl %ebx, SAVE_EBX
+
+ movl PARAM_SRC, %eax
+ movl PARAM_DST, %edx
+
+ subl $1, %ecx C better code alignment than decl
+ jb L(zero)
+
+ jz L(one_more)
+ leal 4(%edx,%ecx,4), %ebx
+
+Zdisp( movd, 0,(%eax,%ecx,4), %mm0) C high limb
+Zdisp( movd, %mm0, 0,(%edx,%ecx,4)) C Zdisp for good code alignment
+
+ cmpl $1, %ecx
+ je L(one_more)
+
+ shrl $2, %ebx
+ andl $1, %ebx C 1 if dst[size-2] unaligned
+
+ subl %ebx, %ecx
+ nop C code alignment
+
+L(top):
+ C eax src
+ C ebx
+ C ecx counter
+ C edx dst
+
+ movq -4(%eax,%ecx,4), %mm0
+ subl $2, %ecx
+
+ movq %mm0, 4(%edx,%ecx,4)
+ ja L(top)
+
+
+L(one_more):
+ movd (%eax), %mm0
+ movd %mm0, (%edx)
+
+ movl SAVE_EBX, %ebx
+ emms_or_femms
+L(zero):
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/k6/k62mmx/lshift.asm b/gmp-6.3.0/mpn/x86/k6/k62mmx/lshift.asm
new file mode 100644
index 0000000..c86575f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/k6/k62mmx/lshift.asm
@@ -0,0 +1,294 @@
+dnl AMD K6-2 mpn_lshift -- mpn left shift.
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6-2: 1.75 cycles/limb
+
+
+C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl used after src has been fetched
+define(VAR_RETVAL,`PARAM_SRC')
+
+dnl minimum 9, because unrolled loop can't handle less
+deflit(UNROLL_THRESHOLD, 9)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_lshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shldl( %cl, %edx, %eax) C return value
+
+ shll %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx,%eax,4), %edx C src high limb
+ negl %ecx
+
+ movd PARAM_SHIFT, %mm6
+ addl $32, %ecx C 32-shift
+
+ shrl %cl, %edx
+ cmpl $UNROLL_THRESHOLD-1, %eax
+
+ movl %edx, VAR_RETVAL
+ jae L(unroll)
+
+
+ movd %ecx, %mm7
+ movl %eax, %ecx
+
+ movl PARAM_DST, %eax
+
+L(simple):
+ C eax dst
+ C ebx src
+ C ecx counter, size-1 to 1
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+ C mm7 32-shift
+
+ movq -4(%ebx,%ecx,4), %mm0
+
+ psrlq %mm7, %mm0
+
+Zdisp( movd, %mm0, 0,(%eax,%ecx,4))
+ loop L(simple)
+
+
+ movd (%ebx), %mm0
+ popl %ebx
+
+ psllq %mm6, %mm0
+
+ movd %mm0, (%eax)
+ movl %edx, %eax
+
+ femms
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval (but instead VAR_RETVAL is used)
+ C
+ C mm6 shift
+
+ addl $32, %ecx
+ movl PARAM_DST, %edx
+
+ movd %ecx, %mm7
+ subl $7, %eax C size-8
+
+ leal (%edx,%eax,4), %ecx C alignment of dst
+
+ movq 32-8(%ebx,%eax,4), %mm2 C src high qword
+ testb $4, %cl
+
+ jz L(dst_aligned)
+ psllq %mm6, %mm2
+
+ psrlq $32, %mm2
+ decl %eax
+
+ movd %mm2, 32(%edx,%eax,4) C dst high limb
+ movq 32-8(%ebx,%eax,4), %mm2 C new src high qword
+L(dst_aligned):
+
+ movq 32-16(%ebx,%eax,4), %mm0 C src second highest qword
+
+
+ C This loop is the important bit, the rest is just support for it.
+ C Four src limbs are held at the start, and four more will be read.
+ C Four dst limbs will be written. This schedule seems necessary for
+ C full speed.
+ C
+ C The use of size-8 lets the loop stop when %eax goes negative and
+ C leaves -4 to -1 which can be tested with test $1 and $2.
+
+L(top):
+ C eax counter, size-8 step by -4 until <0
+ C ebx src
+ C ecx
+ C edx dst
+ C
+ C mm0 src next qword
+ C mm1 scratch
+ C mm2 src prev qword
+ C mm6 shift
+ C mm7 64-shift
+
+ psllq %mm6, %mm2
+ subl $4, %eax
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm2
+ movq 24(%ebx,%eax,4), %mm0
+
+ psllq %mm6, %mm1
+ movq %mm2, 40(%edx,%eax,4)
+
+ movq %mm0, %mm2
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm1
+ movq 16(%ebx,%eax,4), %mm0
+
+ movq %mm1, 32(%edx,%eax,4)
+ jnc L(top)
+
+
+ C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
+ C
+ C 8(%ebx) is the next source, and 24(%edx) is the next destination.
+ C %eax is between -4 and -1, representing respectively 0 to 3 extra
+ C limbs that must be read.
+
+
+ testl $2, %eax C testl to avoid bad cache line crossing
+ jz L(finish_nottwo)
+
+ C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
+ C new mm2 and a new mm0 is loaded.
+
+ psllq %mm6, %mm2
+ movq %mm0, %mm1
+
+ psrlq %mm7, %mm0
+ subl $2, %eax
+
+ por %mm0, %mm2
+ movq 16(%ebx,%eax,4), %mm0
+
+ movq %mm2, 32(%edx,%eax,4)
+ movq %mm1, %mm2
+L(finish_nottwo):
+
+
+ C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
+
+ testb $1, %al
+ psllq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psrlq %mm7, %mm0
+
+ por %mm0, %mm2
+ psllq %mm6, %mm1
+
+ movq %mm2, 24(%edx,%eax,4)
+ jz L(finish_even)
+
+
+ C Size is odd, so mm1 and one extra limb to process.
+
+ movd (%ebx), %mm0 C src[0]
+ popl %ebx
+deflit(`FRAME',0)
+
+ movq %mm0, %mm2
+ psllq $32, %mm0
+
+ psrlq %mm7, %mm0
+
+ psllq %mm6, %mm2
+ por %mm0, %mm1
+
+ movq %mm1, 4(%edx) C dst[1,2]
+ movd %mm2, (%edx) C dst[0]
+
+ movl VAR_RETVAL, %eax
+
+ femms
+ ret
+
+
+ nop C avoid bad cache line crossing
+L(finish_even):
+deflit(`FRAME',4)
+ C Size is even, so only mm1 left to process.
+
+ movq %mm1, (%edx) C dst[0,1]
+ movl VAR_RETVAL, %eax
+
+ popl %ebx
+ femms
+ ret
+
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/x86/k6/k62mmx/rshift.asm b/gmp-6.3.0/mpn/x86/k6/k62mmx/rshift.asm
new file mode 100644
index 0000000..f604a7b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/k6/k62mmx/rshift.asm
@@ -0,0 +1,293 @@
+dnl AMD K6-2 mpn_rshift -- mpn right shift.
+
+dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6-2: 1.75 cycles/limb
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C unsigned shift);
+C
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+dnl Minimum 9, because the unrolled loop can't handle less.
+dnl
+deflit(UNROLL_THRESHOLD, 9)
+
+ TEXT
+ ALIGN(32)
+
+PROLOGUE(mpn_rshift)
+deflit(`FRAME',0)
+
+ C The 1 limb case can be done without the push %ebx, but it's then
+ C still the same speed. The push is left as a free helping hand for
+ C the two_or_more code.
+
+ movl PARAM_SIZE, %eax
+ pushl %ebx FRAME_pushl()
+
+ movl PARAM_SRC, %ebx
+ decl %eax
+
+ movl PARAM_SHIFT, %ecx
+ jnz L(two_or_more)
+
+ movl (%ebx), %edx C src limb
+ movl PARAM_DST, %ebx
+
+ shrdl( %cl, %edx, %eax) C return value
+
+ shrl %cl, %edx
+
+ movl %edx, (%ebx) C dst limb
+ popl %ebx
+
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16) C avoid offset 0x1f
+L(two_or_more):
+ C eax size-1
+ C ebx src
+ C ecx shift
+ C edx
+
+ movl (%ebx), %edx C src low limb
+ negl %ecx
+
+ addl $32, %ecx
+ movd PARAM_SHIFT, %mm6
+
+ shll %cl, %edx
+ cmpl $UNROLL_THRESHOLD-1, %eax
+
+ jae L(unroll)
+
+
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval
+ C
+ C mm6 shift
+
+ movl PARAM_DST, %ecx
+ leal (%ebx,%eax,4), %ebx
+
+ leal -4(%ecx,%eax,4), %ecx
+ negl %eax
+
+ C This loop runs at about 3 cycles/limb, which is the amount of
+ C decoding, and this is despite every second access being unaligned.
+
+L(simple):
+ C eax counter, -(size-1) to -1
+ C ebx &src[size-1]
+ C ecx &dst[size-1]
+ C edx retval
+ C
+ C mm0 scratch
+ C mm6 shift
+
+Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
+ incl %eax
+
+ psrlq %mm6, %mm0
+
+Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
+ jnz L(simple)
+
+
+ movq %mm0, (%ecx)
+ movl %edx, %eax
+
+ popl %ebx
+
+ femms
+ ret
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax size-1
+ C ebx src
+ C ecx 32-shift
+ C edx retval
+ C
+ C mm6 shift
+
+ addl $32, %ecx
+ subl $7, %eax C size-8
+
+ movd %ecx, %mm7
+ movl PARAM_DST, %ecx
+
+ movq (%ebx), %mm2 C src low qword
+ leal (%ebx,%eax,4), %ebx C src end - 32
+
+ testb $4, %cl
+ leal (%ecx,%eax,4), %ecx C dst end - 32
+
+ notl %eax C -(size-7)
+ jz L(dst_aligned)
+
+ psrlq %mm6, %mm2
+ incl %eax
+
+Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb
+ movq 4(%ebx,%eax,4), %mm2 C new src low qword
+L(dst_aligned):
+
+ movq 12(%ebx,%eax,4), %mm0 C src second lowest qword
+ nop C avoid bad cache line crossing
+
+
+ C This loop is the important bit, the rest is just support for it.
+ C Four src limbs are held at the start, and four more will be read.
+ C Four dst limbs will be written. This schedule seems necessary for
+ C full speed.
+ C
+ C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
+ C and leaves 0 to 3 which can be tested with test $1 and $2.
+
+L(top):
+ C eax counter, -(size-7) step by +4 until >=0
+ C ebx src end - 32
+ C ecx dst end - 32
+ C edx retval
+ C
+ C mm0 src next qword
+ C mm1 scratch
+ C mm2 src prev qword
+ C mm6 shift
+ C mm7 64-shift
+
+ psrlq %mm6, %mm2
+ addl $4, %eax
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm0, %mm2
+ movq 4(%ebx,%eax,4), %mm0
+
+ psrlq %mm6, %mm1
+ movq %mm2, -12(%ecx,%eax,4)
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm0, %mm1
+ movq 12(%ebx,%eax,4), %mm0
+
+ movq %mm1, -4(%ecx,%eax,4)
+ ja L(top) C jump if no carry and not zero
+
+
+
+ C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
+ C to 3 representing respectively 3 to 0 further limbs.
+
+ testl $2, %eax C testl to avoid bad cache line crossings
+ jnz L(finish_nottwo)
+
+ C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
+ C becomes new mm2 and a new mm0 is loaded.
+
+ psrlq %mm6, %mm2
+ movq %mm0, %mm1
+
+ psllq %mm7, %mm0
+ addl $2, %eax
+
+ por %mm0, %mm2
+ movq 12(%ebx,%eax,4), %mm0
+
+ movq %mm2, -4(%ecx,%eax,4)
+ movq %mm1, %mm2
+L(finish_nottwo):
+
+
+ testb $1, %al
+ psrlq %mm6, %mm2
+
+ movq %mm0, %mm1
+ psllq %mm7, %mm0
+
+ por %mm0, %mm2
+ psrlq %mm6, %mm1
+
+ movq %mm2, 4(%ecx,%eax,4)
+ jnz L(finish_even)
+
+
+ C one further extra limb to process
+
+ movd 32-4(%ebx), %mm0 C src[size-1], most significant limb
+ popl %ebx
+
+ movq %mm0, %mm2
+ psllq %mm7, %mm0
+
+ por %mm0, %mm1
+ psrlq %mm6, %mm2
+
+ movq %mm1, 32-12(%ecx) C dst[size-3,size-2]
+ movd %mm2, 32-4(%ecx) C dst[size-1]
+
+ movl %edx, %eax C retval
+
+ femms
+ ret
+
+
+ nop C avoid bad cache line crossing
+L(finish_even):
+ C no further extra limbs
+
+ movq %mm1, 32-8(%ecx) C dst[size-2,size-1]
+ movl %edx, %eax C retval
+
+ popl %ebx
+
+ femms
+ ret
+
+EPILOGUE()