aboutsummaryrefslogtreecommitdiff
path: root/gmp-6.3.0/mpn/x86_64/divrem_2.asm
diff options
context:
space:
mode:
Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/divrem_2.asm')
-rw-r--r--gmp-6.3.0/mpn/x86_64/divrem_2.asm192
1 files changed, 192 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/divrem_2.asm b/gmp-6.3.0/mpn/x86_64/divrem_2.asm
new file mode 100644
index 0000000..20811cc
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/divrem_2.asm
@@ -0,0 +1,192 @@
+dnl x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl Copyright 2007, 2008, 2010, 2014 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb best
+C AMD K8,K9 18
+C AMD K10 18
+C AMD bull
+C AMD pile
+C AMD bobcat
+C AMD jaguar
+C Intel P4 68
+C Intel core 34
+C Intel NHM 30.25
+C Intel SBR 21.3
+C Intel IBR 21.4
+C Intel HWL 20.6
+C Intel BWL
+C Intel atom 73
+C VIA nano 33
+
+
+C INPUT PARAMETERS (the code below uses these registers directly, not through the names)
+define(`qp', `%rdi') C quotient destination; limbs are stored from index un+fn-3 down to 0
+define(`fn', `%rsi') C count of low quotient limbs developed from zero limbs instead of limbs of up
+define(`up_param', `%rdx') C dividend limbs; the 2-limb remainder is written back to its two lowest limbs
+define(`un_param', `%rcx') C number of limbs at up
+define(`dp', `%r8') C divisor limbs: d0 at offset 0, d1 at offset 8
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+C mpn_divrem_2: quotient -> qp, remainder -> up[0..1], returns top quotient limb (0 or 1) in rax.
+ASM_START()
+ TEXT
+ ALIGN(16)
+PROLOGUE(mpn_divrem_2)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8 ') C DOS64 only: load the 5th argument (dp) from the stack
+ push %r15 C save the callee-saved registers used below
+ push %r14
+ push %r13
+ push %r12
+ lea -24(%rdx,%rcx,8), %r12 C r12 = &up[un-3]; up[un-1] is 16(r12), up[un-2] is 8(r12)
+ mov %rsi, %r13 C r13 = fn
+ push %rbp
+ mov %rdi, %rbp C rbp = qp
+ push %rbx
+ mov 8(%r8), %r11 C r11 = d1 (high divisor limb)
+ mov 16(%r12), %rbx C rbx = n2 = up[un-1]
+ mov (%r8), %r8 C r8 = d0 (low divisor limb); the dp pointer is overwritten here
+ mov 8(%r12), %r10 C r10 = n1 = up[un-2]
+C Top quotient limb: r15 = 1 and {rbx,r10} -= {d1,d0} when {rbx,r10} >= {d1,d0}, else r15 = 0.
+ xor R32(%r15), R32(%r15) C r15 = 0
+ cmp %rbx, %r11
+ ja L(2) C d1 > n2: top limb stays 0
+ setb %dl C dl = (d1 < n2)
+ cmp %r10, %r8
+ setbe %al C al = (d0 <= n1)
+ orb %al, %dl C "orb" form to placate Sun tools
+ je L(2) C n2 == d1 and n1 < d0: top limb stays 0
+ inc R32(%r15)
+ sub %r8, %r10
+ sbb %r11, %rbx C {rbx,r10} -= {d1,d0}
+L(2):
+ lea -3(%rcx,%r13), %r14 C r14 = un + fn - 3, index of the highest remaining quotient limb
+ test %r14, %r14
+ js L(end) C negative: no further quotient limbs to develop
+C Call mpn_invert_limb with d1 as argument; r8, r10, r11 are saved across the call.
+ push %r8
+ push %r10
+ push %r11
+IFSTD(` mov %r11, %rdi ') C STD64: argument register
+IFDOS(` mov %r11, %rcx ') C DOS64: argument register
+IFDOS(` sub $32, %rsp ') C DOS64: 32 bytes of stack reserved around the call
+ ASSERT(nz, `test $15, %rsp') C alignment check; NOTE(review): nz presumably allows for a pushf inside ASSERT -- confirm
+ CALL( mpn_invert_limb) C result is taken from rax below
+IFDOS(` add $32, %rsp ')
+ pop %r11
+ pop %r10
+ pop %r8
+C Refine the inverse using d0: rdi is decremented once per subtraction of d1 until rcx goes negative.
+ mov %r11, %rdx
+ mov %rax, %rdi C rdi = candidate dinv
+ imul %rax, %rdx C rdx = low limb of d1 * inverse
+ mov %rdx, %r9
+ mul %r8 C rdx:rax = inverse * d0
+ xor R32(%rcx), R32(%rcx)
+ add %r8, %r9 C r9 += d0
+ adc $-1, %rcx C rcx = carry - 1
+ add %rdx, %r9 C r9 += high limb of inverse * d0
+ adc $0, %rcx
+ js 2f C rcx negative: dinv needs no correction
+1: dec %rdi C dinv--
+ sub %r11, %r9
+ sbb $0, %rcx C {rcx,r9} -= d1
+ jns 1b
+2:
+
+ lea (%rbp,%r14,8), %rbp C rbp = &qp[un+fn-3]; quotient limbs are stored high to low
+ mov %r11, %rsi
+ neg %rsi C rsi = -d1
+C Loop register roles (bracketed tags on loop lines are the original per-instruction annotations):
+C rbx = n2 (high remainder limb), r10 = n1 (low remainder limb), r8 = d0, r11 = d1, rsi = -d1, rdi = dinv,
+C rbp = qp cursor, r12 = up cursor, r13 = fn, r14 = limb index, r15 = msl, r9 = q; rax, rcx, rdx are scratch.
+C Each pass develops one quotient limb from {rbx, r10, next limb} and leaves the remainder in {rbx,r10}.
+ ALIGN(16)
+L(top): mov %rdi, %rax C rax = dinv [di, ncp]
+ mul %rbx C rdx:rax = dinv * n2 [0, 17]
+ mov %r10, %rcx C rcx = n1
+ add %rax, %rcx C rcx = q0 = low product + n1 [4]
+ adc %rbx, %rdx C rdx = q = high product + n2 + carry [5]
+ mov %rdx, %r9 C r9 = q [6]
+ imul %rsi, %rdx C rdx = -d1 * q [6]
+ mov %r8, %rax C rax = d0 [ncp]
+ lea (%rdx, %r10), %rbx C rbx = n1 - d1 * q [10]
+ xor R32(%r10), R32(%r10) C r10 = 0, the limb used when none is loaded from up
+ mul %r9 C rdx:rax = d0 * q [7]
+ cmp %r14, %r13 C
+ jg L(19) C fn > r14 (signed): no dividend limb is loaded, r10 stays 0
+ mov (%r12), %r10 C r10 = next dividend limb
+ sub $8, %r12 C step the up cursor down one limb
+L(19): sub %r8, %r10 C [ncp]
+ sbb %r11, %rbx C {rbx,r10} -= {d1,d0} [11]
+ sub %rax, %r10 C [11]
+ sbb %rdx, %rbx C {rbx,r10} -= d0 * q [12]
+ xor R32(%rax), R32(%rax) C
+ xor R32(%rdx), R32(%rdx) C {rdx,rax} = 0, the default addback
+ cmp %rcx, %rbx C carry set when rbx < q0 [13]
+ cmovnc %r8, %rax C no carry: addback low limb = d0 [14]
+ cmovnc %r11, %rdx C no carry: addback high limb = d1 [14]
+ adc $0, %r9 C q += carry (cmovnc leaves flags intact), so q++ only when nothing is added back [adjust q, 14]
+ nop
+ add %rax, %r10 C [15]
+ adc %rdx, %rbx C {rbx,r10} += addback [16]
+ cmp %r11, %rbx C
+ jae L(fix) C rbx >= d1: remainder may still be >= divisor, checked in L(fix)
+L(bck): mov %r9, (%rbp) C store quotient limb
+ sub $8, %rbp C move to the next lower quotient limb
+ dec %r14
+ jns L(top) C loop while r14 >= 0
+
+L(end): mov %r10, 8(%r12) C low remainder limb; r12 = &up[-1] here, so this is up[0]
+ mov %rbx, 16(%r12) C high remainder limb -> up[1]
+ pop %rbx
+ pop %rbp
+ pop %r12
+ pop %r13
+ pop %r14
+ mov %r15, %rax C return value = top quotient limb (0 or 1)
+ pop %r15
+ FUNC_EXIT()
+ ret
+C L(fix): entered with flags from the cmp of rbx against d1; does q++ and r -= d when {rbx,r10} >= {d1,d0}.
+L(fix): seta %dl C dl = (rbx > d1)
+ cmp %r8, %r10
+ setae %al C al = (r10 >= d0)
+ orb %dl, %al C "orb" form to placate Sun tools
+ je L(bck) C rbx == d1 and r10 < d0: remainder is already below the divisor
+ inc %r9 C q++
+ sub %r8, %r10
+ sbb %r11, %rbx C {rbx,r10} -= {d1,d0}
+ jmp L(bck)
+EPILOGUE()