From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie <antigravityd@gmail.com>
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm | 166 ++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm

(limited to 'gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm')

diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm
new file mode 100644
index 0000000..ee88bab
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/mod_1_1.asm
@@ -0,0 +1,166 @@
+dnl  x86-32 mpn_mod_1_1p for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Optimize.  The present code was written quite straightforwardly.
+C  * Optimize post-loop reduction code; it is from mod_1s_4p, thus overkill.
+C  * Write a cps function that uses sse2 insns.
+
+C                           cycles/limb
+C P6 model 0-8,10-12		-
+C P6 model 9   (Banias)		?
+C P6 model 13  (Dothan)		?
+C P4 model 0-1 (Willamette)	?
+C P4 model 2   (Northwood)     16
+C P4 model 3-4 (Prescott)      18
+
+C INPUT PARAMETERS
+C ap		sp + 4
+C n		sp + 8
+C b		sp + 12
+C cps		sp + 16
+
+define(`B1modb', `%mm1')
+define(`B2modb', `%mm2')
+define(`ap',     `%edx')
+define(`n',      `%eax')
+
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_1_1p)
+	push	%ebx
+	mov	8(%esp), ap
+	mov	12(%esp), n
+	mov	20(%esp), %ecx
+	movd	8(%ecx), B1modb
+	movd	12(%ecx), B2modb
+
+	lea	-4(ap,n,4), ap
+
+C FIXME: See comment in generic/mod_1_1.c.
+	movd	(ap), %mm7
+	movd	-4(ap), %mm4
+	pmuludq B1modb, %mm7
+	paddq	%mm4, %mm7
+	add	$-2, n
+	jz	L(end)
+
+	ALIGN(8)
+L(top):	movq	%mm7, %mm6
+	psrlq	$32, %mm7		C rh
+	movd	-8(ap), %mm0
+	add	$-4, ap
+	pmuludq	B2modb, %mm7
+	pmuludq	B1modb, %mm6
+	add	$-1, n
+	paddq	%mm0, %mm7
+	paddq	%mm6, %mm7
+	jnz	L(top)
+
+L(end):	pcmpeqd	%mm4, %mm4
+	psrlq	$32, %mm4		C 0x00000000FFFFFFFF
+	pand	%mm7, %mm4		C rl
+	psrlq	$32, %mm7		C rh
+	pmuludq	B1modb, %mm7		C rh,cl
+	paddq	%mm4, %mm7		C rh,rl
+	movd	4(%ecx), %mm4		C cnt
+	psllq	%mm4, %mm7		C rh,rl normalized
+	movq	%mm7, %mm2		C rl in low half
+	psrlq	$32, %mm7		C rh
+	movd	(%ecx), %mm1		C bi
+	pmuludq	%mm7, %mm1		C qh,ql
+	paddq	%mm2, %mm1		C qh-1,ql
+	movd	%mm1, %ecx		C ql
+	psrlq	$32, %mm1		C qh-1
+	movd	16(%esp), %mm3		C b
+	pmuludq	%mm1, %mm3		C (qh-1) * b
+	psubq	%mm3, %mm2		C r in low half (could use psubd)
+	movd	%mm2, %eax		C r
+	mov	16(%esp), %ebx
+	sub	%ebx, %eax		C r
+	cmp	%eax, %ecx
+	lea	(%eax,%ebx), %edx
+	cmovc(	%edx, %eax)
+	movd	%mm4, %ecx		C cnt
+	cmp	%ebx, %eax
+	jae	L(fix)
+	emms
+	pop	%ebx
+	shr	%cl, %eax
+	ret
+
+L(fix):	sub	%ebx, %eax
+	emms
+	pop	%ebx
+	shr	%cl, %eax
+	ret
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps)
+C CAUTION: This is the same code as in k7/mod_1_1.asm
+	push	%ebp
+	mov	12(%esp), %ebp
+	push	%esi
+	bsr	%ebp, %ecx
+	push	%ebx
+	xor	$31, %ecx
+	mov	16(%esp), %esi
+	sal	%cl, %ebp
+	mov	%ebp, %edx
+	not	%edx
+	mov	$-1, %eax
+	div	%ebp
+	mov	%eax, (%esi)		C store bi
+	mov	%ecx, 4(%esi)		C store cnt
+	xor	%ebx, %ebx
+	sub	%ebp, %ebx
+	mov	$1, %edx
+	shld	%cl, %eax, %edx
+	imul	%edx, %ebx
+	mul	%ebx
+	add	%ebx, %edx
+	not	%edx
+	imul	%ebp, %edx
+	add	%edx, %ebp
+	cmp	%edx, %eax
+	cmovc(	%ebp, %edx)
+	shr	%cl, %ebx
+	mov	%ebx, 8(%esi)		C store B1modb
+	shr	%cl, %edx
+	mov	%edx, 12(%esi)		C store B2modb
+	pop	%ebx
+	pop	%esi
+	pop	%ebp
+	ret
+EPILOGUE()
-- 
cgit v1.2.3