From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm | 167 ++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm (limited to 'gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm new file mode 100644 index 0000000..f34b3f0 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/pentium4/mod_34lsub1.asm @@ -0,0 +1,167 @@ +dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1. + +dnl Copyright 2000-2002, 2004, 2005, 2007, 2010-2012 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 1.0 +C AMD K10 1.12 +C Intel P4 3.25 +C Intel core2 1.5 +C Intel corei 1.5 +C Intel atom 2.5 +C VIA nano 1.75 + + +C INPUT PARAMETERS +define(`ap', %rdi) +define(`n', %rsi) + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n) + +C TODO +C * Review feed-in and wind-down code. In particular, try to avoid adc and +C sbb to placate Pentium4. +C * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling, +C without the dual loop exits. + +ABI_SUPPORT(DOS64) +ABI_SUPPORT(STD64) + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mod_34lsub1) + FUNC_ENTRY(2) + + mov $0x0000FFFFFFFFFFFF, %r11 + + sub $2, %rsi + ja L(gt2) + + mov (ap), %rax + nop + jb L(1) + + mov 8(ap), %rsi + mov %rax, %rdx + shr $48, %rax C src[0] low + + and %r11, %rdx C src[0] high + add %rdx, %rax + mov R32(%rsi), R32(%rdx) + + shr $32, %rsi C src[1] high + add %rsi, %rax + + shl $16, %rdx C src[1] low + add %rdx, %rax + +L(1): FUNC_EXIT() + ret + + + ALIGN(16) +L(gt2): xor R32(%rax), R32(%rax) + xor R32(%rcx), R32(%rcx) + xor R32(%rdx), R32(%rdx) + xor %r8, %r8 + xor %r9, %r9 + xor %r10, %r10 + +L(top): add (ap), %rax + adc $0, %r10 + add 8(ap), %rcx + adc $0, %r8 + add 16(ap), %rdx + adc $0, %r9 + + sub $3, %rsi + jng L(end) + + add 24(ap), %rax + adc $0, %r10 + add 32(ap), %rcx + adc $0, %r8 + add 40(ap), %rdx + lea 48(ap), ap + adc $0, %r9 + + sub $3, %rsi + jg L(top) + + + add $-24, ap +L(end): add %r9, %rax + adc %r10, %rcx + adc %r8, %rdx + + inc %rsi + mov $0x1, R32(%r10) + js L(combine) + + mov $0x10000, R32(%r10) + adc 24(ap), %rax + dec %rsi + js L(combine) + + adc 32(ap), %rcx + mov $0x100000000, %r10 + +L(combine): + sbb %rsi, %rsi C carry + mov %rax, %rdi C 0mod3 + shr $48, %rax C 0mod3 high + + and %r10, %rsi C carry masked + and %r11, %rdi C 0mod3 low + mov R32(%rcx), R32(%r10) C 1mod3 + + add %rsi, %rax C apply carry + shr $32, %rcx C 1mod3 high + + add %rdi, %rax C apply 0mod3 low + movzwl %dx, R32(%rdi) C 2mod3 + shl $16, %r10 C 1mod3 low + + add %rcx, %rax C apply 1mod3 high + shr $16, %rdx C 2mod3 high + + add %r10, %rax C apply 1mod3 low + shl $32, %rdi C 2mod3 low + + add %rdx, %rax C apply 2mod3 high + add %rdi, %rax C apply 2mod3 low + + FUNC_EXIT() + ret +EPILOGUE() -- cgit v1.2.3