From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm | 215 +++++++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm (limited to 'gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm') diff --git a/gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm b/gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm new file mode 100644 index 0000000..75421a6 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/mod_34lsub1.asm @@ -0,0 +1,215 @@ +dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1. + +dnl Copyright 2000-2002, 2004, 2005, 2007, 2009-2012 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C AMD K8,K9 0.67 0.583 is possible with zero-reg instead of $0, 4-way +C AMD K10 0.67 this seems hard to beat +C AMD bd1 1 +C AMD bd2 1 +C AMD bd3 ? +C AMD bd4 ? 
+C AMD zen	 0.62
+C AMD bobcat	 1.07
+C AMD jaguar	 1
+C Intel P4	 7.35	  terrible, use old code
+C Intel core2	 1.25	  1+epsilon with huge unrolling
+C Intel NHM	 1.15	  this seems hard to beat
+C Intel SBR	 0.93
+C Intel IBR	 0.93
+C Intel HWL	 0.82
+C Intel BWL	 0.64
+C Intel SKY	 0.60
+C Intel atom	 2.5
+C Intel SLM	 1.59
+C VIA nano	 1.25	  this seems hard to beat
+
+C INPUT PARAMETERS
+define(`ap',	%rdi)
+define(`n',	%rsi)
+C Folds the n limbs at up into one limb congruent to them modulo 2^48-1; the sum is returned without a final reduction below 2^48.
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
+
+C TODO
+C  * Review feed-in and wind-down code.
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mod_34lsub1)
+	FUNC_ENTRY(2)
+
+	mov	$0x0000FFFFFFFFFFFF, %r11	C r11 = 2^48-1, mask for the low 48 bits
+
+	mov	(ap), %rax		C rax = up[0]
+
+	cmp	$2, %rsi
+	ja	L(gt2)			C n > 2: accumulate in three limb classes
+
+	jb	L(one)			C n < 2: return up[0] exactly as loaded
+C n = 2: fold both limbs directly; 2^64 is congruent to 2^16 and 2^96 to 1 modulo 2^48-1
+	mov	8(ap), %rsi		C rsi = up[1]
+	mov	%rax, %rdx
+	shr	$48, %rax		C src[0] high (top 16 bits)
+
+	and	%r11, %rdx		C src[0] low (48 bits)
+	add	%rdx, %rax
+	mov	R32(%rsi), R32(%rdx)	C copy of src[1] made through the R32 register names
+
+	shr	$32, %rsi		C src[1] high
+	add	%rsi, %rax
+
+	shl	$16, %rdx		C src[1] low
+	add	%rdx, %rax
+L(one):	FUNC_EXIT()
+	ret
+
+
+C Don't change this, the wind-down code is not able to handle greater values
+define(UNROLL,3)
+
+L(gt2):	mov	8(ap), %rcx		C rcx = up[1], accumulator for limbs 1 mod 3
+	mov	16(ap), %rdx		C rdx = up[2], accumulator for limbs 2 mod 3
+	xor	%r9, %r9		C r9 = count of carries out of rdx
+	add	$24, ap			C step past the three limbs just loaded
+	sub	$eval(UNROLL*3+3), %rsi	C n minus the 3 loaded limbs and one full loop pass
+	jc	L(end)			C too few limbs left for a full pass
+	ALIGN(16)
+L(top):
+	add	(ap), %rax		C carry chain runs rax -> rcx -> rdx -> r9
+	adc	8(ap), %rcx
+	adc	16(ap), %rdx
+	adc	$0, %r9			C collect the carry out of rdx
+forloop(i,1,UNROLL-1,`dnl
+	add	eval(i*24)(ap), %rax
+	adc	eval(i*24+8)(ap), %rcx
+	adc	eval(i*24+16)(ap), %rdx
+	adc	$0, %r9
+')dnl
+	add	$eval(UNROLL*24), ap
+	sub	$eval(UNROLL*3), %rsi
+	jnc	L(top)
+
+L(end):					C here rsi + 9 = limbs still to add, 0..8; the 36 and 72 below bias the table index by 9
+	lea	L(tab)(%rip), %r8
+ifdef(`PIC',
+`	movslq	36(%r8,%rsi,4), %r10
+	add	%r10, %r8
+	jmp	*%r8
+',`
+	jmp	*72(%r8,%rsi,8)
+')
+	JUMPTABSECT
+	ALIGN(8)
+L(tab):	JMPENT(	L(0), L(tab))
+	JMPENT(	L(1), L(tab))
+	JMPENT(	L(2), L(tab))
+	JMPENT(	L(3), L(tab))
+	JMPENT(	L(4), L(tab))
+	JMPENT(	L(5), L(tab))
+	JMPENT(	L(6), L(tab))
+	JMPENT(	L(7), L(tab))
+	JMPENT(	L(8), L(tab))
+	TEXT
+C Wind-down: entry L(k) adds the last k limbs, three per step; carries out of rdx end up in r9
+L(6):	add	(ap), %rax
+	adc	8(ap), %rcx
+	adc	16(ap), %rdx
+	adc	$0, %r9
+	add	$24, ap
+L(3):	add	(ap), %rax
+	adc	8(ap), %rcx
+	adc	16(ap), %rdx
+	jmp	L(cj1)			C carry out of rdx still pending
+
+L(7):	add	(ap), %rax
+	adc	8(ap), %rcx
+	adc	16(ap), %rdx
+	adc	$0, %r9
+	add	$24, ap
+L(4):	add	(ap), %rax
+	adc	8(ap), %rcx
+	adc	16(ap), %rdx
+	adc	$0, %r9
+	add	$24, ap
+L(1):	add	(ap), %rax
+	adc	$0, %rcx
+	jmp	L(cj2)			C carry out of rcx still pending
+
+L(8):	add	(ap), %rax
+	adc	8(ap), %rcx
+	adc	16(ap), %rdx
+	adc	$0, %r9
+	add	$24, ap
+L(5):	add	(ap), %rax
+	adc	8(ap), %rcx
+	adc	16(ap), %rdx
+	adc	$0, %r9
+	add	$24, ap
+L(2):	add	(ap), %rax
+	adc	8(ap), %rcx
+
+L(cj2):	adc	$0, %rdx
+L(cj1):	adc	$0, %r9
+L(0):	add	%r9, %rax		C 2^192 is congruent to 1 modulo 2^48-1, so the carry count folds into rax
+	adc	$0, %rcx
+	adc	$0, %rdx
+	adc	$0, %rax		C wrap a final carry out of rdx around into rax
+C Final fold: rax has weight 1, rcx weight 2^16 and rdx weight 2^32 modulo 2^48-1
+	mov	%rax, %rdi		C 0mod3
+	shr	$48, %rax		C 0mod3 high
+
+	and	%r11, %rdi		C 0mod3 low
+	mov	R32(%rcx), R32(%r10)	C 1mod3
+
+	shr	$32, %rcx		C 1mod3 high
+
+	add	%rdi, %rax		C apply 0mod3 low
+	movzwl	%dx, R32(%rdi)		C 2mod3
+	shl	$16, %r10		C 1mod3 low
+
+	add	%rcx, %rax		C apply 1mod3 high
+	shr	$16, %rdx		C 2mod3 high
+
+	add	%r10, %rax		C apply 1mod3 low
+	shl	$32, %rdi		C 2mod3 low
+
+	add	%rdx, %rax		C apply 2mod3 high
+	add	%rdi, %rax		C apply 2mod3 low
+
+	FUNC_EXIT()
+	ret
+EPILOGUE()
-- cgit v1.2.3