From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/x86_64/mode1o.asm | 171 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 171 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/x86_64/mode1o.asm

(limited to 'gmp-6.3.0/mpn/x86_64/mode1o.asm')

diff --git a/gmp-6.3.0/mpn/x86_64/mode1o.asm b/gmp-6.3.0/mpn/x86_64/mode1o.asm
new file mode 100644
index 0000000..2cd2b08
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/mode1o.asm
@@ -0,0 +1,171 @@
+dnl  AMD64 mpn_modexact_1_odd -- Hensel norm remainder.
+
+dnl  Copyright 2000-2006, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	      cycles/limb
+C AMD K8,K9	10
+C AMD K10	10
+C Intel P4	33
+C Intel core2	13
+C Intel corei	14.5
+C Intel atom	35
+C VIA nano	 ?
+
+
+C The dependent chain in the main loop is
+C
+C				cycles
+C	sub	%rdx, %rax	  1
+C	imul	%r9, %rax	  4
+C	mul	%r8		  5
+C				----
+C	total			 10
+C
+C The mov load from src seems to need to be scheduled back before the jz to
+C achieve this speed, out-of-order execution apparently can't completely hide
+C the latency otherwise.
+C
+C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it
+C for the first iteration (where there's no cbit).
+C
+C The code alignment used (32-byte) for the loop also seems necessary.  Without
+C that the non-PIC case has adc crossing the 0x60 offset, apparently making it
+C run at 11 cycles instead of 10.
+
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_modexact_1_odd)
+	FUNC_ENTRY(3)
+	mov	$0, R32(%rcx)
+IFDOS(`	jmp	L(ent)		')
+
+PROLOGUE(mpn_modexact_1c_odd)
+	FUNC_ENTRY(4)
+L(ent):
+	C rdi	src
+	C rsi	size
+	C rdx	divisor
+	C rcx	carry
+
+	mov	%rdx, %r8		C d
+	shr	R32(%rdx)		C d/2
+
+	LEA(	binvert_limb_table, %r9)
+
+	and	$127, R32(%rdx)
+	mov	%rcx, %r10		C initial carry
+
+	movzbl	(%r9,%rdx), R32(%rdx)	C inv 8 bits
+
+	mov	(%rdi), %rax		C src[0]
+	lea	(%rdi,%rsi,8), %r11	C src end
+	mov	%r8, %rdi		C d, made available to imull
+
+	lea	(%rdx,%rdx), R32(%rcx)	C 2*inv
+	imul	R32(%rdx), R32(%rdx)	C inv*inv
+
+	neg	%rsi			C -size
+
+	imul	R32(%rdi), R32(%rdx)	C inv*inv*d
+
+	sub	R32(%rdx), R32(%rcx)	C inv = 2*inv - inv*inv*d, 16 bits
+
+	lea	(%rcx,%rcx), R32(%rdx)	C 2*inv
+	imul	R32(%rcx), R32(%rcx)	C inv*inv
+
+	imul	R32(%rdi), R32(%rcx)	C inv*inv*d
+
+	sub	R32(%rcx), R32(%rdx)	C inv = 2*inv - inv*inv*d, 32 bits
+	xor	R32(%rcx), R32(%rcx)	C initial cbit
+
+	lea	(%rdx,%rdx), %r9	C 2*inv
+	imul	%rdx, %rdx		C inv*inv
+
+	imul	%r8, %rdx		C inv*inv*d
+
+	sub	%rdx, %r9		C inv = 2*inv - inv*inv*d, 64 bits
+	mov	%r10, %rdx		C initial climb
+
+	ASSERT(e,`	C d*inv == 1 mod 2^64
+	mov	%r8, %r10
+	imul	%r9, %r10
+	cmp	$1, %r10')
+
+	inc	%rsi
+	jz	L(one)
+
+
+	ALIGN(16)
+L(top):
+	C rax	l = src[i]-cbit
+	C rcx	new cbit, 0 or 1
+	C rdx	climb, high of last product
+	C rsi	counter, limbs, negative
+	C rdi
+	C r8	divisor
+	C r9	inverse
+	C r11	src end ptr
+
+	sub	%rdx, %rax		C l = src[i]-cbit - climb
+
+	adc	$0, %rcx		C more cbit
+	imul	%r9, %rax		C q = l * inverse
+
+	mul	%r8			C climb = high (q * d)
+
+	mov	(%r11,%rsi,8), %rax	C src[i+1]
+	sub	%rcx, %rax		C next l = src[i+1] - cbit
+	setc	R8(%rcx)		C new cbit
+
+	inc	%rsi
+	jnz	L(top)
+
+
+L(one):
+	sub	%rdx, %rax		C l = src[i]-cbit - climb
+
+	adc	$0, %rcx		C more cbit
+	imul	%r9, %rax		C q = l * inverse
+
+	mul	%r8			C climb = high (q * d)
+
+	lea	(%rcx,%rdx), %rax	C climb+cbit
+	FUNC_EXIT()
+	ret
+
+EPILOGUE(mpn_modexact_1c_odd)
+EPILOGUE(mpn_modexact_1_odd)
--
cgit v1.2.3
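
For readers following the setup code in the patch: the routine builds a 64-bit inverse of the odd divisor d by seeding 8 correct bits from GMP's binvert_limb_table and then applying the quadratically converging step inv = 2*inv - inv*inv*d three times (8 -> 16 -> 32 -> 64 bits), exactly as the register comments say. The C sketch below shows the same bit-doubling idea; it is not GMP code, and to stay self-contained it seeds 5 correct bits with the well-known (3*d)^2 trick instead of the table, so it needs one extra doubling step. The name binvert64 is just for this sketch.

#include <stdint.h>

/* Sketch of the bit-doubling inversion: for odd d, find inv with
   d*inv == 1 (mod 2^64).  The asm seeds 8 good bits from
   binvert_limb_table; this standalone version seeds 5 good bits with
   the (3*d)^2 trick, so it needs four doubling steps instead of three. */
static uint64_t binvert64(uint64_t d)
{
    uint64_t inv = (3 * d) ^ 2;        /* correct to the low 5 bits */
    inv = 2 * inv - inv * inv * d;     /* 10 bits */
    inv = 2 * inv - inv * inv * d;     /* 20 bits */
    inv = 2 * inv - inv * inv * d;     /* 40 bits */
    inv = 2 * inv - inv * inv * d;     /* 64 bits */
    return inv;                        /* d * inv == 1 (mod 2^64) */
}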
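The main loop's arithmetic, as described by the register comments, can likewise be sketched in C: each limb has the outstanding borrow (cbit) and the previous high product (climb) subtracted from it, the result is multiplied by the inverse to get q, and the high half of q*d becomes the next climb; the final return value is climb plus the remaining borrow. This is only an illustration of what the loop computes, not GMP's generic implementation; it assumes 64-bit limbs, the GCC/Clang unsigned __int128 extension for the high multiply, n >= 1, and it reuses the hypothetical binvert64 helper from the previous sketch.

#include <stdint.h>
#include <stddef.h>

/* Illustration of the mpn_modexact_1c_odd loop for 64-bit limbs:
   Hensel-norm remainder of {src, n} by odd d with initial carry c. */
static uint64_t modexact_1c_odd_sketch(const uint64_t *src, size_t n,
                                       uint64_t d, uint64_t c)
{
    uint64_t inv = binvert64(d);      /* d * inv == 1 (mod 2^64) */
    uint64_t climb = c;               /* high half of the previous q*d */
    uint64_t cbit = 0;                /* borrows owed to the current limb */

    for (size_t i = 0; i < n; i++) {
        uint64_t s = src[i];
        uint64_t b = (s < cbit);      /* borrow from subtracting cbit */
        uint64_t l = s - cbit;
        b += (l < climb);             /* borrow from subtracting climb */
        l -= climb;                   /* l = src[i] - cbit - climb (mod 2^64) */
        uint64_t q = l * inv;         /* so q*d == l (mod 2^64) */
        climb = (uint64_t)(((unsigned __int128)q * d) >> 64);
        cbit = b;
    }
    return climb + cbit;              /* the asm's final climb+cbit */
}

Because each q*d cancels the current limb exactly in its low 64 bits, only the high half and a small borrow propagate, which is why the loop needs no division instruction at all.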