From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm | 173 ++++++++++++++++++++++++++++++++
 1 file changed, 173 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm

diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm b/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm
new file mode 100644
index 0000000..6a17b93
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm
@@ -0,0 +1,173 @@
+dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE.
+
+dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.
+
+dnl  Copyright 2010-2012, 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                   cycles/limb        cycles/limb         good
+C                 16-byte aligned   16-byte unaligned     for cpu?
+C AMD K8,K9            ?                  ?
+C AMD K10           1.68 (1.45)        1.75 (1.49)           Y
+C AMD bd1           1.82 (1.75)        1.82 (1.75)           Y
+C AMD bobcat        4                  4
+C Intel P4          3    (2.7)         3    (2.7)            Y
+C Intel core2       2.05 (1.67)        2.55 (1.75)
+C Intel NHM         2.05 (1.75)        2.09 (2)
+C Intel SBR         1.5  (1.3125)      1.5  (1.4375)         Y
+C Intel atom           ?                  ?
+C VIA nano          2.25 (2)           2.5  (2)              Y
+
+C We try to do as many 16-byte operations as possible.  The top-most and
+C bottom-most writes might need 8-byte operations.
+
+C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
+C not true.  The aligned case reads 16+8 bytes, the unaligned case reads
+C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
+
+C This is not yet great code:
+C   (1) The unaligned case makes many reads.
+C   (2) We should do some unrolling, at least 2-way.
+C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
+C Nano.
+
+C INPUT PARAMETERS
+define(`rp',  `%rdi')
+define(`ap',  `%rsi')
+define(`n',   `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+        TEXT
+        ALIGN(64)
+PROLOGUE(mpn_lshift)
+        FUNC_ENTRY(4)
+        movd    R32(%rcx), %xmm4
+        mov     $64, R32(%rax)
+        sub     R32(%rcx), R32(%rax)
+        movd    R32(%rax), %xmm5
+
+        neg     R32(%rcx)
+        mov     -8(ap,n,8), %rax
+        shr     R8(%rcx), %rax
+
+        cmp     $2, n
+        jle     L(le2)
+
+        lea     (rp,n,8), R32(%rcx)
+        test    $8, R8(%rcx)
+        je      L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+        movq    -8(ap,n,8), %xmm0
+        movq    -16(ap,n,8), %xmm1
+        psllq   %xmm4, %xmm0
+        psrlq   %xmm5, %xmm1
+        por     %xmm1, %xmm0
+        movq    %xmm0, -8(rp,n,8)
+        dec     n
+
+L(rp_aligned):
+        lea     (ap,n,8), R32(%rcx)
+        test    $8, R8(%rcx)
+        je      L(aent)
+        jmp     L(uent)
+C *****************************************************************************
+
+C Handle the case when ap != rp (mod 16).
+
+        ALIGN(16)
+L(utop):movdqa  -8(ap,n,8), %xmm0
+        movq    (ap,n,8), %xmm1
+        punpcklqdq 8(ap,n,8), %xmm1
+        psllq   %xmm4, %xmm1
+        psrlq   %xmm5, %xmm0
+        por     %xmm1, %xmm0
+        movdqa  %xmm0, (rp,n,8)
+L(uent):sub     $2, n
+        ja      L(utop)
+
+        jne     L(end8)
+
+        movq    (ap), %xmm1
+        pxor    %xmm0, %xmm0
+        punpcklqdq %xmm1, %xmm0
+        punpcklqdq 8(ap), %xmm1
+        psllq   %xmm4, %xmm1
+        psrlq   %xmm5, %xmm0
+        por     %xmm1, %xmm0
+        movdqa  %xmm0, (rp)
+        FUNC_EXIT()
+        ret
+C *****************************************************************************
+
+C Handle the case when ap = rp (mod 16).
+
+        ALIGN(16)
+L(atop):movdqa  (ap,n,8), %xmm0         C xmm0 = B*ap[n-1] + ap[n-2]
+        movq    -8(ap,n,8), %xmm1       C xmm1 = ap[n-3]
+        punpcklqdq %xmm0, %xmm1         C xmm1 = B*ap[n-2] + ap[n-3]
+        psllq   %xmm4, %xmm0
+        psrlq   %xmm5, %xmm1
+        por     %xmm1, %xmm0
+        movdqa  %xmm0, (rp,n,8)
+L(aent):
+        sub     $2, n
+        ja      L(atop)
+        jne     L(end8)
+
+        movdqa  (ap), %xmm1
+        pxor    %xmm0, %xmm0
+        punpcklqdq %xmm1, %xmm0
+        psllq   %xmm4, %xmm1
+        psrlq   %xmm5, %xmm0
+        por     %xmm1, %xmm0
+        movdqa  %xmm0, (rp)
+        FUNC_EXIT()
+        ret
+C *****************************************************************************
+
+        ALIGN(16)
+L(le2): jne     L(end8)
+
+        movq    8(ap), %xmm0
+        movq    (ap), %xmm1
+        psllq   %xmm4, %xmm0
+        psrlq   %xmm5, %xmm1
+        por     %xmm1, %xmm0
+        movq    %xmm0, 8(rp)
+
+L(end8):movq    (ap), %xmm0
+        psllq   %xmm4, %xmm0
+        movq    %xmm0, (rp)
+        FUNC_EXIT()
+        ret
+EPILOGUE()
--
cgit v1.2.3
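
For reference, here is a minimal C sketch of the operation the assembly above implements, written against the same parameter names (rp, ap, n, cnt). It is not part of the patch: the type name limb_t and the function name ref_lshift are illustrative stand-ins for GMP's mp_limb_t and the mpn_lshift prototype, and the sketch assumes 64-bit limbs with 1 <= cnt <= 63, the same precondition the assembly relies on (cnt = 0 would make the 64-cnt right shifts undefined).

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb_t;    /* stand-in for GMP's mp_limb_t on 64-bit targets */

/* Shift the n-limb number {ap, n} left by cnt bits, store the low n limbs of
   the result at {rp, n}, and return the bits shifted out of the top limb.
   Working from the most significant limb downwards keeps the sketch correct
   when rp overlaps ap from above, as the assembly also permits. */
static limb_t ref_lshift(limb_t *rp, const limb_t *ap, size_t n, unsigned cnt)
{
    unsigned tnc = 64 - cnt;             /* the shift count held for psrlq (%xmm5) */
    limb_t retval = ap[n - 1] >> tnc;    /* bits pushed out of the top limb */

    for (size_t i = n - 1; i > 0; i--)
        rp[i] = (ap[i] << cnt) | (ap[i - 1] >> tnc);
    rp[0] = ap[0] << cnt;                /* zeros are shifted in at the bottom */

    return retval;
}

The SSE loops compute the same recurrence two limbs at a time: psllq by xmm4 forms ap[i] << cnt for a pair of limbs, psrlq by xmm5 forms ap[i-1] >> (64-cnt) for the same pair, and por combines them into a 16-byte store. The L(utop) and L(atop) variants differ in which of the two vectors can be fetched with an aligned movdqa load and which has to be assembled with movq/punpcklqdq, depending on whether ap and rp have the same alignment mod 16.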