From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm | 204 +++++++++++++++++++++++++
 1 file changed, 204 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm

diff --git a/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm b/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm
new file mode 100644
index 0000000..e7b7feb
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/fastsse/sec_tabselect.asm
@@ -0,0 +1,204 @@
+dnl  AMD64 SSE mpn_sec_tabselect.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                        cycles/limb    cycles/limb    cycles/limb
+C                         ali,evn n     unal,evn n     other cases
+C AMD K8,K9                  1.65           1.65           1.8
+C AMD K10                    0.78           0.78           0.85
+C AMD bd1                    0.80           0.91           1.25
+C AMD bobcat                 2.15           2.15           2.37
+C Intel P4                   2.5            2.5            2.95
+C Intel core2                1.17           1.25           1.25
+C Intel NHM                  0.87           0.90           0.90
+C Intel SBR                  0.63           0.79           0.77
+C Intel atom                 4.3            4.3            4.3     slower than plain code
+C VIA nano                   1.4            5.1            3.14    too alignment dependent
+
+C NOTES
+C  * We only honour the least significant 32 bits of the `which' and `nents'
+C    arguments, which allows efficient code using just SSE2.  Honouring all
+C    64 bits would need either the SSE4_1 pcmpeqq or some other SSE2 sequence.
+C  * We use movd for copying between xmm and plain registers, since old gas
+C    rejects movq.  But gas assembles movd as movq when given a 64-bit greg.
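+C  * The selection is branch-free with respect to `which': xmm1 carries four
+C    copies of a running entry index and xmm8 four copies of `which'; pcmpeqd
+C    turns their comparison into a mask (all ones for the wanted entry, all
+C    zeroes otherwise), and each table entry is ANDed with the mask and ORed
+C    into the accumulators.  Every entry is read, so the memory access
+C    pattern is independent of `which'.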
+
+define(`rp',     `%rdi')
+define(`tp',     `%rsi')
+define(`n',      `%rdx')
+define(`nents',  `%rcx')
+define(`which',  `%r8')
+
+define(`i',      `%r10')
+define(`j',      `%r9')
+
+C      rax  rbx  rcx    rdx  rdi  rsi  rbp  r8     r9  r10  r11   r12  r13  r14  r15
+C                nents  n    rp   tab       which  j   i    temp  *    *    *    *
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+        TEXT
+        ALIGN(16)
+PROLOGUE(mpn_sec_tabselect)
+        FUNC_ENTRY(4)
+IFDOS(` mov     56(%rsp), %r8d  ')
+
+IFDOS(` add     $-88, %rsp      ')
+IFDOS(` movdqu  %xmm6, (%rsp)   ')
+IFDOS(` movdqu  %xmm7, 16(%rsp) ')
+IFDOS(` movdqu  %xmm8, 32(%rsp) ')
+IFDOS(` movdqu  %xmm9, 48(%rsp) ')
+
+        movd    which, %xmm8
+        pshufd  $0, %xmm8, %xmm8        C 4 `which' copies
+        mov     $1, R32(%rax)
+        movd    %rax, %xmm9
+        pshufd  $0, %xmm9, %xmm9        C 4 copies of 1
+
+        mov     n, j
+        add     $-8, j
+        js      L(outer_end)
+
+L(outer_top):
+        mov     nents, i
+        mov     tp, %r11
+        pxor    %xmm1, %xmm1
+        pxor    %xmm4, %xmm4
+        pxor    %xmm5, %xmm5
+        pxor    %xmm6, %xmm6
+        pxor    %xmm7, %xmm7
+        ALIGN(16)
+L(top): movdqa  %xmm8, %xmm0
+        pcmpeqd %xmm1, %xmm0
+        paddd   %xmm9, %xmm1
+        movdqu  0(tp), %xmm2
+        movdqu  16(tp), %xmm3
+        pand    %xmm0, %xmm2
+        pand    %xmm0, %xmm3
+        por     %xmm2, %xmm4
+        por     %xmm3, %xmm5
+        movdqu  32(tp), %xmm2
+        movdqu  48(tp), %xmm3
+        pand    %xmm0, %xmm2
+        pand    %xmm0, %xmm3
+        por     %xmm2, %xmm6
+        por     %xmm3, %xmm7
+        lea     (tp,n,8), tp
+        add     $-1, i
+        jne     L(top)
+
+        movdqu  %xmm4, 0(rp)
+        movdqu  %xmm5, 16(rp)
+        movdqu  %xmm6, 32(rp)
+        movdqu  %xmm7, 48(rp)
+
+        lea     64(%r11), tp
+        lea     64(rp), rp
+        add     $-8, j
+        jns     L(outer_top)
+L(outer_end):
+
+        test    $4, R8(n)
+        je      L(b0xx)
+L(b1xx):mov     nents, i
+        mov     tp, %r11
+        pxor    %xmm1, %xmm1
+        pxor    %xmm4, %xmm4
+        pxor    %xmm5, %xmm5
+        ALIGN(16)
+L(tp4): movdqa  %xmm8, %xmm0
+        pcmpeqd %xmm1, %xmm0
+        paddd   %xmm9, %xmm1
+        movdqu  0(tp), %xmm2
+        movdqu  16(tp), %xmm3
+        pand    %xmm0, %xmm2
+        pand    %xmm0, %xmm3
+        por     %xmm2, %xmm4
+        por     %xmm3, %xmm5
+        lea     (tp,n,8), tp
+        add     $-1, i
+        jne     L(tp4)
+        movdqu  %xmm4, 0(rp)
+        movdqu  %xmm5, 16(rp)
+        lea     32(%r11), tp
+        lea     32(rp), rp
+
+L(b0xx):test    $2, R8(n)
+        je      L(b00x)
+L(b01x):mov     nents, i
+        mov     tp, %r11
+        pxor    %xmm1, %xmm1
+        pxor    %xmm4, %xmm4
+        ALIGN(16)
+L(tp2): movdqa  %xmm8, %xmm0
+        pcmpeqd %xmm1, %xmm0
+        paddd   %xmm9, %xmm1
+        movdqu  0(tp), %xmm2
+        pand    %xmm0, %xmm2
+        por     %xmm2, %xmm4
+        lea     (tp,n,8), tp
+        add     $-1, i
+        jne     L(tp2)
+        movdqu  %xmm4, 0(rp)
+        lea     16(%r11), tp
+        lea     16(rp), rp
+
+L(b00x):test    $1, R8(n)
+        je      L(b000)
+L(b001):mov     nents, i
+        mov     tp, %r11
+        pxor    %xmm1, %xmm1
+        pxor    %xmm4, %xmm4
+        ALIGN(16)
+L(tp1): movdqa  %xmm8, %xmm0
+        pcmpeqd %xmm1, %xmm0
+        paddd   %xmm9, %xmm1
+        movq    0(tp), %xmm2
+        pand    %xmm0, %xmm2
+        por     %xmm2, %xmm4
+        lea     (tp,n,8), tp
+        add     $-1, i
+        jne     L(tp1)
+        movq    %xmm4, 0(rp)
+
+L(b000):
+IFDOS(` movdqu  (%rsp), %xmm6   ')
+IFDOS(` movdqu  16(%rsp), %xmm7 ')
+IFDOS(` movdqu  32(%rsp), %xmm8 ')
+IFDOS(` movdqu  48(%rsp), %xmm9 ')
+IFDOS(` add     $88, %rsp       ')
+        FUNC_EXIT()
+        ret
+EPILOGUE()
--
cgit v1.2.3
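For reference, the selection logic that the SSE loops above implement can be written in scalar C. The sketch below is illustrative only and is not GMP's generic code; `ref_sec_tabselect' is a hypothetical name and `limb_t' stands in for a 64-bit mp_limb_t. It follows the mpn_sec_tabselect interface: copy the n-limb entry number `which' out of the nents-entry table `tab' into `rp', reading every entry so that the memory access pattern does not depend on `which'.

    #include <stddef.h>
    #include <stdint.h>

    typedef uint64_t limb_t;   /* stands in for mp_limb_t on x86_64 */

    static void
    ref_sec_tabselect (limb_t *rp, const limb_t *tab,
                       size_t n, size_t nents, size_t which)
    {
      for (size_t j = 0; j < n; j++)
        rp[j] = 0;
      for (size_t k = 0; k < nents; k++)
        {
          /* All ones when k == which, else all zeroes; the assembly builds
             the analogous mask with pcmpeqd.  */
          limb_t mask = (limb_t) 0 - (limb_t) (k == which);
          for (size_t j = 0; j < n; j++)
            rp[j] |= tab[k * n + j] & mask;
        }
    }

Masking and accumulating every entry, rather than indexing the table with `which' directly, is what makes the routine usable in side-channel-sensitive code: neither the branches taken nor the addresses touched depend on the secret index.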