From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/x86/k7/mod_1_4.asm | 260 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 260 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/x86/k7/mod_1_4.asm

diff --git a/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm b/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm
new file mode 100644
index 0000000..bb7597e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm
@@ -0,0 +1,260 @@
+dnl  x86-32 mpn_mod_1s_4p, requiring cmov.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                            cycles/limb
+C P5                          ?
+C P6 model 0-8,10-12          ?
+C P6 model 9   (Banias)       ?
+C P6 model 13  (Dothan)       6
+C P4 model 0   (Willamette)   ?
+C P4 model 1   (?)            ?
+C P4 model 2   (Northwood)   15.5
+C P4 model 3   (Prescott)     ?
+C P4 model 4   (Nocona)       ?
+C AMD K6                      ?
+C AMD K7                      4.75
+C AMD K8                      ?
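+
+C The C prototype is assumed to be the one gmp-impl.h declares:
+C   mp_limb_t mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b,
+C                            const mp_limb_t cps[7]);
+C After the four pushes and the 28-byte frame set up below, the arguments
+C sit at 48(%esp) = ap, 52(%esp) = n, 56(%esp) = b, 60(%esp) = cps.  The
+C five folding constants cps[2]..cps[6] (B1modb..B5modb, computed by
+C mpn_mod_1s_4p_cps at the bottom of this file) are first copied to the
+C local frame at 4(%esp)..20(%esp).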
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p)
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	sub	$28, %esp
+	mov	60(%esp), %edi		C cps[]
+	mov	8(%edi), %eax
+	mov	12(%edi), %edx
+	mov	16(%edi), %ecx
+	mov	20(%edi), %esi
+	mov	24(%edi), %edi
+	mov	%eax, 4(%esp)
+	mov	%edx, 8(%esp)
+	mov	%ecx, 12(%esp)
+	mov	%esi, 16(%esp)
+	mov	%edi, 20(%esp)
+	mov	52(%esp), %eax		C n
+	xor	%edi, %edi
+	mov	48(%esp), %esi		C up
+	lea	-12(%esi,%eax,4), %esi
+	and	$3, %eax
+	je	L(b0)
+	cmp	$2, %eax
+	jc	L(b1)
+	je	L(b2)
+
+L(b3):	mov	4(%esi), %eax
+	mull	4(%esp)
+	mov	(%esi), %ebp
+	add	%eax, %ebp
+	adc	%edx, %edi
+	mov	8(%esi), %eax
+	mull	8(%esp)
+	lea	-12(%esi), %esi
+	jmp	L(m0)
+
+L(b0):	mov	(%esi), %eax
+	mull	4(%esp)
+	mov	-4(%esi), %ebp
+	add	%eax, %ebp
+	adc	%edx, %edi
+	mov	4(%esi), %eax
+	mull	8(%esp)
+	add	%eax, %ebp
+	adc	%edx, %edi
+	mov	8(%esi), %eax
+	mull	12(%esp)
+	lea	-16(%esi), %esi
+	jmp	L(m0)
+
+L(b1):	mov	8(%esi), %ebp
+	lea	-4(%esi), %esi
+	jmp	L(m1)
+
+L(b2):	mov	8(%esi), %edi
+	mov	4(%esi), %ebp
+	lea	-8(%esi), %esi
+	jmp	L(m1)
+
+	ALIGN(16)
+L(top):	mov	(%esi), %eax
+	mull	4(%esp)
+	mov	-4(%esi), %ebx
+	xor	%ecx, %ecx
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	mov	4(%esi), %eax
+	mull	8(%esp)
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	mov	8(%esi), %eax
+	mull	12(%esp)
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	lea	-16(%esi), %esi
+	mov	16(%esp), %eax
+	mul	%ebp
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	mov	20(%esp), %eax
+	mul	%edi
+	mov	%ebx, %ebp
+	mov	%ecx, %edi
+L(m0):	add	%eax, %ebp
+	adc	%edx, %edi
+L(m1):	subl	$4, 52(%esp)
+	ja	L(top)
+
+L(end):	mov	4(%esp), %eax
+	mul	%edi
+	mov	60(%esp), %edi
+	add	%eax, %ebp
+	adc	$0, %edx
+	mov	4(%edi), %ecx
+	mov	%edx, %esi
+	mov	%ebp, %eax
+	sal	%cl, %esi
+	mov	%ecx, %ebx
+	neg	%ecx
+	shr	%cl, %eax
+	or	%esi, %eax
+	lea	1(%eax), %esi
+	mull	(%edi)
+	mov	%ebx, %ecx
+	mov	%eax, %ebx
+	mov	%ebp, %eax
+	mov	56(%esp), %ebp
+	sal	%cl, %eax
+	add	%eax, %ebx
+	adc	%esi, %edx
+	imul	%ebp, %edx
+	sub	%edx, %eax
+	lea	(%eax,%ebp), %edx
+	cmp	%eax, %ebx
+	cmovc(	%edx, %eax)
+	mov	%eax, %edx
+	sub	%ebp, %eax
+	cmovc(	%edx, %eax)
+	add	$28, %esp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+	shr	%cl, %eax
+	ret
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p_cps)
+C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx
+	mov	24(%esp), %ebx
+	bsr	%ebx, %ecx
+	xor	$31, %ecx
+	sal	%cl, %ebx		C b << cnt
+	mov	%ebx, %edx
+	not	%edx
+	mov	$-1, %eax
+	div	%ebx
+	xor	%edi, %edi
+	sub	%ebx, %edi
+	mov	$1, %esi
+	mov	%eax, (%ebp)		C store bi
+	mov	%ecx, 4(%ebp)		C store cnt
+	shld	%cl, %eax, %esi
+	imul	%edi, %esi
+	mov	%eax, %edi
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 8(%ebp)		C store B1modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 12(%ebp)		C store B2modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 16(%ebp)		C store B3modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 20(%ebp)		C store B4modb
+
+	not	%edx
+	imul	%ebx, %edx
+	add	%edx, %ebx
+	cmp	%edx, %eax
+	cmovnc(	%edx, %ebx)
+
+	shr	%cl, %ebx
+	mov	%ebx, 24(%ebp)		C store B5modb
+
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+	ret
+EPILOGUE()
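
As a companion to the patch above, here is a rough C sketch of the limb-folding
scheme that mpn_mod_1s_4p implements, assuming 32-bit limbs.  This is not GMP's
code (a portable version ships as mpn/generic/mod_1_4.c): the names mod1_4_cps
and mod1_4 are invented, the accumulator is fully reduced with a plain % each
pass, and the final divide-by-reciprocal step (the bi and cnt entries of cps[],
used after L(end) above) is likewise replaced by %.  The assembly instead
carries a two-limb accumulator between iterations, which is why it also needs
the fifth constant B^5 mod b.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: m[k] = B^k mod b for k = 1..5, where B = 2^32;
       m[0] is unused.  GMP's mpn_mod_1s_4p_cps additionally stores a
       reciprocal of b and a shift count, and keeps the constants
       pre-shifted; this sketch keeps them plain. */
    static void mod1_4_cps(uint32_t m[6], uint32_t b)
    {
        m[1] = (uint32_t)(((uint64_t)1 << 32) % b);            /* B   mod b */
        for (int k = 2; k <= 5; k++)                           /* B^k mod b */
            m[k] = (uint32_t)(((uint64_t)m[k - 1] << 32) % b);
    }

    /* Remainder of the n-limb number ap[0..n-1] (least significant limb
       first) modulo b, folding four limbs per iteration.  Requires
       b < 2^30 so the five-term sum below cannot overflow 64 bits. */
    static uint32_t mod1_4(const uint32_t *ap, long n, uint32_t b,
                           const uint32_t m[6])
    {
        uint64_t r = 0;
        long i = n;

        /* Peel the high n mod 4 limbs with plain Horner steps, standing in
           for the L(b1)/L(b2)/L(b3) entry paths of the assembly. */
        while (i & 3) {
            i--;
            r = ((r << 32) | ap[i]) % b;
        }

        /* Main loop, as in L(top):
           r*B^4 + a3*B^3 + a2*B^2 + a1*B + a0
             == a0 + a1*m1 + a2*m2 + a3*m3 + r*m4  (mod b).
           The four products are independent, so a CPU can overlap them
           instead of serializing on a division per limb. */
        while (i > 0) {
            i -= 4;
            uint64_t s = (uint64_t)ap[i]
                       + (uint64_t)ap[i + 1] * m[1]
                       + (uint64_t)ap[i + 2] * m[2]
                       + (uint64_t)ap[i + 3] * m[3]
                       + (uint64_t)(uint32_t)r * m[4];  /* r < b fits a limb */
            r = s % b;
        }
        return (uint32_t)r;
    }

    int main(void)
    {
        uint32_t ap[6] = { 0x89abcdefu, 0x01234567u, 0xdeadbeefu,
                           0x0badf00du, 0xcafebabeu, 0x12345678u };
        uint32_t b = 1000000007u;          /* any b < 2^30 works */
        uint32_t m[6];
        mod1_4_cps(m, b);

        /* Reference: one-limb-at-a-time Horner reduction. */
        uint64_t r = 0;
        for (int i = 5; i >= 0; i--)
            r = ((r << 32) | ap[i]) % b;

        printf("folded: %u, naive: %u\n", mod1_4(ap, 6, b, m), (uint32_t)r);
        return 0;
    }

The point of the 4-way unrolling is visible in the sketch: a naive Horner loop
does one dependent division per limb, while the folded form replaces it with
independent multiplies by precomputed residues and a single reduction per four
limbs, matching the roughly 4.75 cycles/limb the table above quotes for K7.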