aboutsummaryrefslogtreecommitdiff
path: root/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm
diff options
context:
space:
mode:
Diffstat (limited to 'gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm')
-rw-r--r--gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm182
1 files changed, 182 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm
new file mode 100644
index 0000000..020675b
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/submul_1.asm
@@ -0,0 +1,182 @@
+dnl Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright 2001, 2002, 2008, 2010 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P6 model 0-8,10-12 -
+C P6 model 9 (Banias) 6.8
+C P6 model 13 (Dothan) 6.9
+C P4 model 0-1 (Willamette) ?
+C P4 model 2 (Northwood) 5.87
+C P4 model 3-4 (Prescott) 6.5
+
+C This code represents a step forwards compared to the code available before
+C GMP 5.1, but it is not carefully tuned for either P6 or P4. In fact, it is
+C not good for P6. For P4 it saved a bit over 1 c/l for both Northwood and
+C Prescott compared to the old code.
+C
+C The arrangements made here to get a two instruction dependent chain are
+C slightly subtle. In the loop the carry (or borrow rather) is a negative so
+C that a paddq can be used to give a low limb ready to store, and a high limb
+C ready to become the new carry after a psrlq.
+C
+C If the carry was a simple twos complement negative then the psrlq shift would
+C need to bring in 0 bits or 1 bits according to whether the high was zero or
+C non-zero, since a non-zero value would represent a negative needing sign
+C extension. That wouldn't be particularly easy to arrange and certainly would
+C add an instruction to the dependent chain, so instead an offset is applied so
+C that the high limb will be 0xFFFFFFFF+c. With c in the range -0xFFFFFFFF to
+C 0, the value 0xFFFFFFFF+c is in the range 0 to 0xFFFFFFFF and is therefore
+C always positive and can always have 0 bits shifted in, which is what psrlq
+C does.
+C
+C The extra 0xFFFFFFFF must be subtracted before c is used, but that can be
+C done off the dependent chain. The total adjustment then is to add
+C 0xFFFFFFFF00000000 to offset the new carry, and subtract 0x00000000FFFFFFFF
+C to remove the offset from the current carry, for a net add of
+C 0xFFFFFFFE00000001. In the code this is applied to the destination limb when
+C fetched.
+C
+C It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement
+C negative, which is how it's undone for the return value, but that doesn't
+C seem as clear.
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_submul_1c)
+deflit(`FRAME',0)
+ movd PARAM_CARRY, %mm1
+ jmp L(start_1c)
+EPILOGUE()
+
+PROLOGUE(mpn_submul_1)
+deflit(`FRAME',0)
+ pxor %mm1, %mm1 C initial borrow
+
+L(start_1c):
+ mov PARAM_SRC, %eax
+ pcmpeqd %mm0, %mm0
+
+ movd PARAM_MULTIPLIER, %mm7
+ pcmpeqd %mm6, %mm6
+
+ mov PARAM_DST, %edx
+ psrlq $32, %mm0 C 0x00000000FFFFFFFF
+
+ mov PARAM_SIZE, %ecx
+ psllq $32, %mm6 C 0xFFFFFFFF00000000
+
+ psubq %mm0, %mm6 C 0xFFFFFFFE00000001
+
+ psubq %mm1, %mm0 C 0xFFFFFFFF - borrow
+
+
+ movd (%eax), %mm3 C up
+ movd (%edx), %mm4 C rp
+
+ add $-1, %ecx
+ paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm3
+ jnz L(gt1)
+ psubq %mm3, %mm4 C prod
+ paddq %mm4, %mm0 C borrow
+ movd %mm0, (%edx) C result
+ jmp L(rt)
+
+L(gt1): movd 4(%eax), %mm1 C up
+ movd 4(%edx), %mm2 C rp
+
+ add $-1, %ecx
+ jz L(eev)
+
+ ALIGN(16)
+L(top): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm1
+ psubq %mm3, %mm4 C prod
+ movd 8(%eax), %mm3 C up
+ paddq %mm4, %mm0 C borrow
+ movd 8(%edx), %mm4 C rp
+ movd %mm0, (%edx) C result
+ psrlq $32, %mm0
+
+ add $-1, %ecx
+ jz L(eod)
+
+ paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm3
+ psubq %mm1, %mm2 C prod
+ movd 12(%eax), %mm1 C up
+ paddq %mm2, %mm0 C borrow
+ movd 12(%edx), %mm2 C rp
+ movd %mm0, 4(%edx) C result
+ psrlq $32, %mm0
+
+ lea 8(%eax), %eax
+ lea 8(%edx), %edx
+ add $-1, %ecx
+ jnz L(top)
+
+
+L(eev): paddq %mm6, %mm2 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm1
+ psubq %mm3, %mm4 C prod
+ paddq %mm4, %mm0 C borrow
+ movd %mm0, (%edx) C result
+ psrlq $32, %mm0
+ psubq %mm1, %mm2 C prod
+ paddq %mm2, %mm0 C borrow
+ movd %mm0, 4(%edx) C result
+L(rt): psrlq $32, %mm0
+ movd %mm0, %eax
+ not %eax
+ emms
+ ret
+
+L(eod): paddq %mm6, %mm4 C add 0xFFFFFFFE00000001
+ pmuludq %mm7, %mm3
+ psubq %mm1, %mm2 C prod
+ paddq %mm2, %mm0 C borrow
+ movd %mm0, 4(%edx) C result
+ psrlq $32, %mm0
+ psubq %mm3, %mm4 C prod
+ paddq %mm4, %mm0 C borrow
+ movd %mm0, 8(%edx) C result
+ jmp L(rt)
+EPILOGUE()