From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm | 300 +++++++++++++++++++++++++ 1 file changed, 300 insertions(+) create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm (limited to 'gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm') diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm b/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm new file mode 100644 index 0000000..9876a47 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/fastsse/copyi-palignr.asm @@ -0,0 +1,300 @@ +dnl AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3. + +dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? 
+C AMD K8,K9	 2.0	 illop		1.0/1.0		N
+C AMD K10	 0.85	 illop				Y/N
+C AMD bd1	 0.70	 0.66				Y
+C AMD bd2	 0.68	 0.66				Y
+C AMD bd3	 ?	 ?
+C AMD bd4	 ?	 ?
+C AMD bt1	 1.97	 8.16	1.5/1.5			N
+C AMD bt2	 0.77	 0.93	0.65/opt		N/Y
+C AMD zn1	 ?	 ?
+C AMD zn2	 ?	 ?
+C Intel P4	 2.26	 illop				Y/N
+C Intel CNR	 0.52	 0.64	opt/opt			Y
+C Intel NHM	 0.52	 0.71	0.50/0.67		N
+C Intel SBR	 0.51	 0.54	opt/0.51		Y
+C Intel IBR	 0.50	 0.54	opt/opt			Y
+C Intel HWL	 0.50	 0.51	opt/opt			Y
+C Intel BWL	 0.55	 0.55	opt/opt			Y
+C Intel atom	 1.16	 1.61	opt/opt			Y
+C Intel SLM	 1.02	 1.07	opt/opt			Y
+C VIA nano	 1.09	 1.08	opt/opt			Y
+
+C We use only 16-byte operations, except for unaligned top-most and bottom-most
+C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
+C instruction is better adapted to mpn_copyd's needs, so we need to contort the
+C code to use it here.
+C
+C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
+C taken from the x86_64 default code.
+
+C INPUT PARAMETERS
+C void mpn_copyi (mp_ptr rp, mp_srcptr up, mp_size_t n)
+C Copy n limbs from up to rp in ascending address order.
+C rp = %rdi (destination), up = %rsi (source), n = %rdx (limb count).
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n',  `%rdx')
+
+C There are three instructions for loading an aligned 128-bit quantity.  We use
+C movaps, since it has the shortest coding.
+dnl define(`movdqa', ``movaps'')
+
+ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')
+
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_copyi)
+	FUNC_ENTRY(3)
+
+	cmp	$COPYI_SSE_THRESHOLD, n
+	jbe	L(bc)			C small n: plain 64-bit basecase loop
+
+	test	$8, R8(rp)		C is rp 16-byte aligned?
+	jz	L(rp_aligned)		C jump if rp aligned
+
+	movsq				C copy one limb
+	dec	n
+
+C Here rp is 16-byte aligned.  Dispatch on up's alignment: same alignment
+C as rp uses the plain movdqa loop, otherwise the palignr path at L(uent).
+L(rp_aligned):
+	test	$8, R8(up)
+	jnz	L(uent)
+
+ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
+`	sub	$8, n',
+`	jmp	L(am)')
+
+C Aligned loop: rp and up are both 16-byte aligned; copy 64 bytes (8 limbs)
+C per iteration, then mop up 0-7 remaining limbs in chunks of 4, 2, 1.
+	ALIGN(16)
+L(atop):movdqa	0(up), %xmm0
+	movdqa	16(up), %xmm1
+	movdqa	32(up), %xmm2
+	movdqa	48(up), %xmm3
+	lea	64(up), up
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	movdqa	%xmm2, 32(rp)
+	movdqa	%xmm3, 48(rp)
+	lea	64(rp), rp
+L(am):	sub	$8, n
+	jnc	L(atop)
+
+	test	$4, R8(n)
+	jz	1f
+	movdqa	(up), %xmm0
+	movdqa	16(up), %xmm1
+	lea	32(up), up
+	movdqa	%xmm0, (rp)
+	movdqa	%xmm1, 16(rp)
+	lea	32(rp), rp
+
+1:	test	$2, R8(n)
+	jz	1f
+	movdqa	(up), %xmm0
+	lea	16(up), up
+	movdqa	%xmm0, (rp)
+	lea	16(rp), rp
+
+1:	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	%r8, (rp)
+
+1:	FUNC_EXIT()
+	ret
+
+L(uent):
+C Code handling up - rp = 8 (mod 16)
+C rp is 16-byte aligned but up = 8 (mod 16), so offsets 8, 24, 40, ... from
+C up are 16-byte aligned.  We do aligned 16-byte loads and stitch adjacent
+C halves together with palignr($8, ...) before each aligned store.
+
+	cmp	$16, n
+	jc	L(ued0)			C fewer than 16 limbs: skip pipelined loop
+
+C Win64 ABI: xmm6-xmm15 are callee-saved; this path clobbers xmm6-xmm8,
+C so spill them around the main loop.
+IFDOS(`	add	$-56, %rsp	')
+IFDOS(`	movdqa	%xmm6, (%rsp)	')
+IFDOS(`	movdqa	%xmm7, 16(%rsp)	')
+IFDOS(`	movdqa	%xmm8, 32(%rsp)	')
+
+	movaps	120(up), %xmm7
+	movaps	104(up), %xmm6
+	movaps	88(up), %xmm5
+	movaps	72(up), %xmm4
+	movaps	56(up), %xmm3
+	movaps	40(up), %xmm2
+	lea	128(up), up
+	sub	$32, n
+	jc	L(ued1)
+
+C Main software-pipelined loop: 16 limbs (128 bytes) per iteration.  Loads
+C for the next iteration are interleaved with palignr/store of the current
+C one; xmm2-xmm7 carry live data across iterations.
+	ALIGN(16)
+L(utop):movaps	-104(up), %xmm1
+	sub	$16, n
+	movaps	-120(up), %xmm0
+	palignr($8, %xmm6, %xmm7)
+	movaps	-136(up), %xmm8
+	movdqa	%xmm7, 112(rp)
+	palignr($8, %xmm5, %xmm6)
+	movaps	120(up), %xmm7
+	movdqa	%xmm6, 96(rp)
+	palignr($8, %xmm4, %xmm5)
+	movaps	104(up), %xmm6
+	movdqa	%xmm5, 80(rp)
+	palignr($8, %xmm3, %xmm4)
+	movaps	88(up), %xmm5
+	movdqa	%xmm4, 64(rp)
+	palignr($8, %xmm2, %xmm3)
+	movaps	72(up), %xmm4
+	movdqa	%xmm3, 48(rp)
+	palignr($8, %xmm1, %xmm2)
+	movaps	56(up), %xmm3
+	movdqa	%xmm2, 32(rp)
+	palignr($8, %xmm0, %xmm1)
+	movaps	40(up), %xmm2
+	movdqa	%xmm1, 16(rp)
+	palignr($8, %xmm8, %xmm0)
+	lea	128(up), up
+	movdqa	%xmm0, (rp)
+	lea	128(rp), rp
+	jnc	L(utop)
+
+C Loop wind-down: flush the 128 bytes still in flight in xmm0-xmm8.
+L(ued1):movaps	-104(up), %xmm1
+	movaps	-120(up), %xmm0
+	movaps	-136(up), %xmm8
+	palignr($8, %xmm6, %xmm7)
+	movdqa	%xmm7, 112(rp)
+	palignr($8, %xmm5, %xmm6)
+	movdqa	%xmm6, 96(rp)
+	palignr($8, %xmm4, %xmm5)
+	movdqa	%xmm5, 80(rp)
+	palignr($8, %xmm3, %xmm4)
+	movdqa	%xmm4, 64(rp)
+	palignr($8, %xmm2, %xmm3)
+	movdqa	%xmm3, 48(rp)
+	palignr($8, %xmm1, %xmm2)
+	movdqa	%xmm2, 32(rp)
+	palignr($8, %xmm0, %xmm1)
+	movdqa	%xmm1, 16(rp)
+	palignr($8, %xmm8, %xmm0)
+	movdqa	%xmm0, (rp)
+	lea	128(rp), rp
+
+IFDOS(`	movdqa	(%rsp), %xmm6	')
+IFDOS(`	movdqa	16(%rsp), %xmm7	')
+IFDOS(`	movdqa	32(%rsp), %xmm8	')
+IFDOS(`	add	$56, %rsp	')
+
+C Handle the remaining 0-15 limbs of the unaligned path, in chunks of
+C 8, 4, 2, 1, still using aligned loads + palignr where 16-byte-wide.
+L(ued0):test	$8, R8(n)
+	jz	1f
+	movaps	56(up), %xmm3
+	movaps	40(up), %xmm2
+	movaps	24(up), %xmm1
+	movaps	8(up), %xmm0
+	movaps	-8(up), %xmm4
+	palignr($8, %xmm2, %xmm3)
+	movdqa	%xmm3, 48(rp)
+	palignr($8, %xmm1, %xmm2)
+	movdqa	%xmm2, 32(rp)
+	palignr($8, %xmm0, %xmm1)
+	movdqa	%xmm1, 16(rp)
+	palignr($8, %xmm4, %xmm0)
+	lea	64(up), up
+	movdqa	%xmm0, (rp)
+	lea	64(rp), rp
+
+1:	test	$4, R8(n)
+	jz	1f
+	movaps	24(up), %xmm1
+	movaps	8(up), %xmm0
+	palignr($8, %xmm0, %xmm1)
+	movaps	-8(up), %xmm3
+	movdqa	%xmm1, 16(rp)
+	palignr($8, %xmm3, %xmm0)
+	lea	32(up), up
+	movdqa	%xmm0, (rp)
+	lea	32(rp), rp
+
+1:	test	$2, R8(n)
+	jz	1f
+	movdqa	8(up), %xmm0
+	movdqa	-8(up), %xmm3
+	palignr($8, %xmm3, %xmm0)
+	lea	16(up), up
+	movdqa	%xmm0, (rp)
+	lea	16(rp), rp
+
+1:	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	%r8, (rp)
+
+1:	FUNC_EXIT()
+	ret
+
+C Basecase code.  Needed for good small operands speed, not for
+C correctness as the above code is currently written.
+C Copies 4 limbs per iteration with plain 64-bit moves, then mops up the
+C remaining 0-3 limbs (n's bit 0, then bit 1).
+
+L(bc):	lea	-8(rp), rp
+	sub	$4, R32(n)
+	jc	L(end)
+
+	ALIGN(16)
+L(top):	mov	(up), %r8
+	mov	8(up), %r9
+	lea	32(rp), rp
+	mov	16(up), %r10
+	mov	24(up), %r11
+	lea	32(up), up
+	mov	%r8, -24(rp)
+	mov	%r9, -16(rp)
+ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
+`	sub	$4, R32(n)')
+	mov	%r10, -8(rp)
+	mov	%r11, (rp)
+ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
+`	jnc	L(top)')
+
+L(end):	test	$1, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	%r8, 8(rp)
+	lea	8(rp), rp
+	lea	8(up), up
+1:	test	$2, R8(n)
+	jz	1f
+	mov	(up), %r8
+	mov	8(up), %r9
+	mov	%r8, 8(rp)
+	mov	%r9, 16(rp)
+1:	FUNC_EXIT()
+	ret
+EPILOGUE()
-- 
cgit v1.2.3