aboutsummaryrefslogtreecommitdiff
path: root/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm
diff options
context:
space:
mode:
Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm')
-rw-r--r--gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm217
1 files changed, 217 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm b/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm
new file mode 100644
index 0000000..ff3a184
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/k8/aorrlsh_n.asm
@@ -0,0 +1,217 @@
+dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U.
+
+dnl Copyright 2006, 2010-2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C AMD K8,K9 2.87 < 3.85 for lshift + add_n
+C AMD K10 2.75 < 3.85 for lshift + add_n
+C Intel P4 22 > 7.33 for lshift + add_n
+C Intel core2 4.1 > 3.27 for lshift + add_n
+C Intel NHM 4.4 > 3.75 for lshift + add_n
+C Intel SBR 3.17 < 3.46 for lshift + add_n
+C Intel atom ? ? 8.75 for lshift + add_n
+C VIA nano 4.7 < 6.25 for lshift + add_n
+
+C TODO
+C * Can we propagate carry into rdx instead of using a special carry register?
+C That could save enough insns to get to 10 cycles/iteration.
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp_param', `%rdx')
+define(`n_param', `%rcx')
+define(`cnt', `%r8')
+
+define(`vp', `%r12')
+define(`n', `%rbp')
+
+ifdef(`OPERATION_addlsh_n',`
+ define(ADDSUB, `add')
+ define(ADCSBB, `adc')
+ define(func, mpn_addlsh_n)
+')
+ifdef(`OPERATION_rsblsh_n',`
+ define(ADDSUB, `sub')
+ define(ADCSBB, `sbb')
+ define(func, mpn_rsblsh_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(func)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ push %r12
+ push %rbp
+ push %rbx
+
+ mov (vp_param), %rax C load first V limb early
+
+ mov $0, R32(n)
+ sub n_param, n
+
+ lea -16(up,n_param,8), up
+ lea -16(rp,n_param,8), rp
+ lea 16(vp_param,n_param,8), vp
+
+ mov n_param, %r9
+
+ mov %r8, %rcx
+ mov $1, R32(%r8)
+ shl R8(%rcx), %r8
+
+ mul %r8 C initial multiply
+
+ and $3, R32(%r9)
+ jz L(b0)
+ cmp $2, R32(%r9)
+ jc L(b1)
+ jz L(b2)
+
+L(b3): mov %rax, %r11
+ ADDSUB 16(up,n,8), %r11
+ mov -8(vp,n,8), %rax
+ sbb R32(%rcx), R32(%rcx)
+ mov %rdx, %rbx
+ mul %r8
+ or %rax, %rbx
+ mov (vp,n,8), %rax
+ mov %rdx, %r9
+ mul %r8
+ or %rax, %r9
+ add $3, n
+ jnz L(lo3)
+ jmp L(cj3)
+
+L(b2): mov %rax, %rbx
+ mov -8(vp,n,8), %rax
+ mov %rdx, %r9
+ mul %r8
+ or %rax, %r9
+ add $2, n
+ jz L(cj2)
+ mov %rdx, %r10
+ mov -16(vp,n,8), %rax
+ mul %r8
+ or %rax, %r10
+ xor R32(%rcx), R32(%rcx) C clear carry register
+ jmp L(lo2)
+
+L(b1): mov %rax, %r9
+ mov %rdx, %r10
+ add $1, n
+ jnz L(gt1)
+ ADDSUB 8(up,n,8), %r9
+ jmp L(cj1)
+L(gt1): mov -16(vp,n,8), %rax
+ mul %r8
+ or %rax, %r10
+ mov %rdx, %r11
+ mov -8(vp,n,8), %rax
+ mul %r8
+ or %rax, %r11
+ ADDSUB 8(up,n,8), %r9
+ ADCSBB 16(up,n,8), %r10
+ ADCSBB 24(up,n,8), %r11
+ mov (vp,n,8), %rax
+ sbb R32(%rcx), R32(%rcx)
+ jmp L(lo1)
+
+L(b0): mov %rax, %r10
+ mov %rdx, %r11
+ mov -8(vp,n,8), %rax
+ mul %r8
+ or %rax, %r11
+ ADDSUB 16(up,n,8), %r10
+ ADCSBB 24(up,n,8), %r11
+ mov (vp,n,8), %rax
+ sbb R32(%rcx), R32(%rcx)
+ mov %rdx, %rbx
+ mul %r8
+ or %rax, %rbx
+ mov 8(vp,n,8), %rax
+ add $4, n
+ jz L(end)
+
+ ALIGN(8)
+L(top): mov %rdx, %r9
+ mul %r8
+ or %rax, %r9
+ mov %r10, -16(rp,n,8)
+L(lo3): mov %rdx, %r10
+ mov -16(vp,n,8), %rax
+ mul %r8
+ or %rax, %r10
+ mov %r11, -8(rp,n,8)
+L(lo2): mov %rdx, %r11
+ mov -8(vp,n,8), %rax
+ mul %r8
+ or %rax, %r11
+ add R32(%rcx), R32(%rcx)
+ ADCSBB (up,n,8), %rbx
+ ADCSBB 8(up,n,8), %r9
+ ADCSBB 16(up,n,8), %r10
+ ADCSBB 24(up,n,8), %r11
+ mov (vp,n,8), %rax
+ sbb R32(%rcx), R32(%rcx)
+ mov %rbx, (rp,n,8)
+L(lo1): mov %rdx, %rbx
+ mul %r8
+ or %rax, %rbx
+ mov %r9, 8(rp,n,8)
+L(lo0): mov 8(vp,n,8), %rax
+ add $4, n
+ jnz L(top)
+
+L(end): mov %rdx, %r9
+ mul %r8
+ or %rax, %r9
+ mov %r10, -16(rp,n,8)
+L(cj3): mov %r11, -8(rp,n,8)
+L(cj2): add R32(%rcx), R32(%rcx)
+ ADCSBB (up,n,8), %rbx
+ ADCSBB 8(up,n,8), %r9
+ mov %rbx, (rp,n,8)
+L(cj1): mov %r9, 8(rp,n,8)
+ mov %rdx, %rax
+ ADCSBB $0, %rax
+ pop %rbx
+ pop %rbp
+ pop %r12
+ FUNC_EXIT()
+ ret
+EPILOGUE()