From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm | 173 ++++++++++++++++++++++++++++++++
 1 file changed, 173 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm

diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm b/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm
new file mode 100644
index 0000000..6a17b93
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/lshift.asm
@@ -0,0 +1,173 @@
+dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE.
+
+dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.
+
+dnl  Copyright 2010-2012, 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                   cycles/limb        cycles/limb         good
+C                 16-byte aligned   16-byte unaligned     for cpu?
+C AMD K8,K9            ?                  ?
+C AMD K10           1.68 (1.45)        1.75 (1.49)           Y
+C AMD bd1           1.82 (1.75)        1.82 (1.75)           Y
+C AMD bobcat        4                  4
+C Intel P4          3    (2.7)         3    (2.7)            Y
+C Intel core2       2.05 (1.67)        2.55 (1.75)
+C Intel NHM         2.05 (1.75)        2.09 (2)
+C Intel SBR         1.5  (1.3125)      1.5  (1.4375)         Y
+C Intel atom           ?                  ?
+C VIA nano          2.25 (2)           2.5  (2)              Y
+
+C We try to do as many 16-byte operations as possible.  The top-most and
+C bottom-most writes might need 8-byte operations.
+
+C There are two inner-loops, one for when rp = ap (mod 16) and one when this is
+C not true.  The aligned case reads 16+8 bytes, the unaligned case reads
+C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
+
+C This is not yet great code:
+C   (1) The unaligned case makes many reads.
+C   (2) We should do some unrolling, at least 2-way.
+C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
+C Nano.
+
+C INPUT PARAMETERS
+define(`rp',  `%rdi')
+define(`ap',  `%rsi')
+define(`n',   `%rdx')
+define(`cnt', `%rcx')
+
+ASM_START()
+        TEXT
+        ALIGN(64)
+PROLOGUE(mpn_lshift)
+        FUNC_ENTRY(4)
+        movd    R32(%rcx), %xmm4
+        mov     $64, R32(%rax)
+        sub     R32(%rcx), R32(%rax)
+        movd    R32(%rax), %xmm5
+
+        neg     R32(%rcx)
+        mov     -8(ap,n,8), %rax
+        shr     R8(%rcx), %rax
+
+        cmp     $2, n
+        jle     L(le2)
+
+        lea     (rp,n,8), R32(%rcx)
+        test    $8, R8(%rcx)
+        je      L(rp_aligned)
+
+C Do one initial limb in order to make rp aligned
+        movq    -8(ap,n,8), %xmm0
+        movq    -16(ap,n,8), %xmm1
+        psllq   %xmm4, %xmm0
+        psrlq   %xmm5, %xmm1
+        por     %xmm1, %xmm0
+        movq    %xmm0, -8(rp,n,8)
+        dec     n
+
+L(rp_aligned):
+        lea     (ap,n,8), R32(%rcx)
+        test    $8, R8(%rcx)
+        je      L(aent)
+        jmp     L(uent)
+C *****************************************************************************
+
+C Handle the case when ap != rp (mod 16).
+
+        ALIGN(16)
+L(utop):movdqa  -8(ap,n,8), %xmm0
+        movq    (ap,n,8), %xmm1
+        punpcklqdq 8(ap,n,8), %xmm1
+        psllq   %xmm4, %xmm1
+        psrlq   %xmm5, %xmm0
+        por     %xmm1, %xmm0
+        movdqa  %xmm0, (rp,n,8)
+L(uent):sub     $2, n
+        ja      L(utop)
+
+        jne     L(end8)
+
+        movq    (ap), %xmm1
+        pxor    %xmm0, %xmm0
+        punpcklqdq %xmm1, %xmm0
+        punpcklqdq 8(ap), %xmm1
+        psllq   %xmm4, %xmm1
+        psrlq   %xmm5, %xmm0
+        por     %xmm1, %xmm0
+        movdqa  %xmm0, (rp)
+        FUNC_EXIT()
+        ret
+C *****************************************************************************
+
+C Handle the case when ap = rp (mod 16).
+
+        ALIGN(16)
+L(atop):movdqa  (ap,n,8), %xmm0         C xmm0 = B*ap[n-1] + ap[n-2]
+        movq    -8(ap,n,8), %xmm1       C xmm1 = ap[n-3]
+        punpcklqdq %xmm0, %xmm1         C xmm1 = B*ap[n-2] + ap[n-3]
+        psllq   %xmm4, %xmm0
+        psrlq   %xmm5, %xmm1
+        por     %xmm1, %xmm0
+        movdqa  %xmm0, (rp,n,8)
+L(aent):
+        sub     $2, n
+        ja      L(atop)
+        jne     L(end8)
+
+        movdqa  (ap), %xmm1
+        pxor    %xmm0, %xmm0
+        punpcklqdq %xmm1, %xmm0
+        psllq   %xmm4, %xmm1
+        psrlq   %xmm5, %xmm0
+        por     %xmm1, %xmm0
+        movdqa  %xmm0, (rp)
+        FUNC_EXIT()
+        ret
+C *****************************************************************************
+
+        ALIGN(16)
+L(le2): jne     L(end8)
+
+        movq    8(ap), %xmm0
+        movq    (ap), %xmm1
+        psllq   %xmm4, %xmm0
+        psrlq   %xmm5, %xmm1
+        por     %xmm1, %xmm0
+        movq    %xmm0, 8(rp)
+
+L(end8):movq    (ap), %xmm0
+        psllq   %xmm4, %xmm0
+        movq    %xmm0, (rp)
+        FUNC_EXIT()
+        ret
+EPILOGUE()
--
cgit v1.2.3
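
For reference, here is a minimal C sketch of the operation the assembly above implements, written against the same parameter names (rp, ap, n, cnt). It is not part of the patch: the type name limb_t and the function name ref_lshift are illustrative stand-ins for GMP's mp_limb_t and the mpn_lshift prototype, and the sketch assumes 64-bit limbs with 1 <= cnt <= 63, the same precondition the assembly relies on (cnt = 0 would make the 64-cnt right shifts undefined).

#include <stdint.h>
#include <stddef.h>

typedef uint64_t limb_t;    /* stand-in for GMP's mp_limb_t on 64-bit targets */

/* Shift the n-limb number {ap, n} left by cnt bits, store the low n limbs of
   the result at {rp, n}, and return the bits shifted out of the top limb.
   Working from the most significant limb downwards keeps the sketch correct
   when rp overlaps ap from above, as the assembly also permits. */
static limb_t ref_lshift(limb_t *rp, const limb_t *ap, size_t n, unsigned cnt)
{
    unsigned tnc = 64 - cnt;             /* the shift count held for psrlq (%xmm5) */
    limb_t retval = ap[n - 1] >> tnc;    /* bits pushed out of the top limb */

    for (size_t i = n - 1; i > 0; i--)
        rp[i] = (ap[i] << cnt) | (ap[i - 1] >> tnc);
    rp[0] = ap[0] << cnt;                /* zeros are shifted in at the bottom */

    return retval;
}

The SSE loops compute the same recurrence two limbs at a time: psllq by xmm4 forms ap[i] << cnt for a pair of limbs, psrlq by xmm5 forms ap[i-1] >> (64-cnt) for the same pair, and por combines them into a 16-byte store. The L(utop) and L(atop) variants differ in which of the two vectors can be fetched with an aligned movdqa load and which has to be assembled with movq/punpcklqdq, depending on whether ap and rp have the same alignment mod 16.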