From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm | 158 +++++++++++++++++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm (limited to 'gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm b/gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm new file mode 100644 index 0000000..5e59a0a --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/div_qr_2n_pi1.asm @@ -0,0 +1,158 @@ +dnl x86-64 mpn_div_qr_2n_pi1 +dnl -- Divide an mpn number by a normalized 2-limb number, +dnl using a single-limb inverse. + +dnl Copyright 2007, 2008, 2010-2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C c/l +C INPUT PARAMETERS +define(`qp', `%rdi') +define(`rp', `%rsi') +define(`up_param', `%rdx') +define(`un', `%rcx') +define(`d1', `%r8') +define(`d0', `%r9') +define(`di_param', `8(%rsp)') + +define(`di', `%r10') +define(`up', `%r11') +define(`u2', `%rbx') +define(`u1', `%r12') +define(`t1', `%r13') +define(`t0', `%r14') +define(`md1', `%r15') + +C TODO +C * Store qh in the same stack slot as di_param, instead of pushing +C it. (we could put it in register %rbp, but then we would need to +C save and restore that instead, which doesn't seem like a win). + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_div_qr_2n_pi1) + FUNC_ENTRY(4) +IFDOS(` mov 56(%rsp), %r8 ') +IFDOS(` mov 64(%rsp), %r9 ') +IFDOS(`define(`di_param', `72(%rsp)')') + mov di_param, di + mov up_param, up + push %r15 + push %r14 + push %r13 + push %r12 + push %rbx + + mov -16(up, un, 8), u1 + mov -8(up, un, 8), u2 + + mov u1, t0 + mov u2, t1 + sub d0, t0 + sbb d1, t1 + cmovnc t0, u1 + cmovnc t1, u2 + C push qh which is !carry + sbb %rax, %rax + inc %rax + push %rax + lea -2(un), un + mov d1, md1 + neg md1 + + jmp L(next) + + ALIGN(16) +L(loop): + C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di) + C Based on the optimized divrem_2.asm code. + + mov di, %rax + mul u2 + mov u1, t0 + add %rax, t0 C q0 in t0 + adc u2, %rdx + mov %rdx, t1 C q in t1 + imul md1, %rdx + mov d0, %rax + lea (%rdx, u1), u2 + mul t1 + mov (up, un, 8), u1 + sub d0, u1 + sbb d1, u2 + sub %rax, u1 + sbb %rdx, u2 + xor R32(%rax), R32(%rax) + xor R32(%rdx), R32(%rdx) + cmp t0, u2 + cmovnc d0, %rax + cmovnc d1, %rdx + adc $0, t1 + nop + add %rax, u1 + adc %rdx, u2 + cmp d1, u2 + jae L(fix) +L(bck): + mov t1, (qp, un, 8) +L(next): + sub $1, un + jnc L(loop) +L(end): + mov u2, 8(rp) + mov u1, (rp) + + C qh on stack + pop %rax + + pop %rbx + pop %r12 + pop %r13 + pop %r14 + pop %r15 + FUNC_EXIT() + ret + +L(fix): C Unlikely update. u2 >= d1 + seta %dl + cmp d0, u1 + setae %al + orb %dl, %al C "orb" form to placate Sun tools + je L(bck) + inc t1 + sub d0, u1 + sbb d1, u2 + jmp L(bck) +EPILOGUE() -- cgit v1.2.3