From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/x86/sqr_basecase.asm | 359 +++++++++++++++++++++++++++++++++++++
 1 file changed, 359 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/x86/sqr_basecase.asm

diff --git a/gmp-6.3.0/mpn/x86/sqr_basecase.asm b/gmp-6.3.0/mpn/x86/sqr_basecase.asm
new file mode 100644
index 0000000..39f8a89
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/sqr_basecase.asm
@@ -0,0 +1,359 @@
+dnl  x86 generic mpn_sqr_basecase -- square an mpn number.
+
+dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C     cycles/crossproduct  cycles/triangleproduct
+C P5
+C P6
+C K6
+C K7
+C P4
+
+
+C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
+C lot of function call overhead is avoided, especially when the size is
+C small.
+C
+C The mul1 loop is not unrolled like mul_1.asm; it doesn't seem worth the
+C code size to do so here.
+C
+C Enhancements:
+C
+C The addmul loop here is also not unrolled like aorsmul_1.asm and
+C mul_basecase.asm are.  Perhaps it should be done.  It'd add to the
+C complexity, but if it's worth doing in the other places then it should be
+C worthwhile here.
+C
+C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
+C might be worth considering.  That'd add quite a bit to the code size, but
+C only as much as is used would be dragged into L1 cache.
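+C
+C For orientation, the triangle + double + diagonal scheme implemented
+C below can be sketched in C using GMP's public mpn calls.  This is an
+C illustrative sketch only (sqr_basecase_sketch is a made-up name, and it
+C is not the exact code of mpn/generic/sqr_basecase.c); it assumes rp has
+C room for 2*n limbs and doesn't overlap up:
+C
+C	static void
+C	sqr_basecase_sketch (mp_ptr rp, mp_srcptr up, mp_size_t n)
+C	{
+C	  mp_size_t i;
+C	  if (n == 1)
+C	    {
+C	      rp[1] = mpn_mul_1 (rp, up, 1, up[0]);	/* up[0]^2 */
+C	      return;
+C	    }
+C	  /* cross products src[0]*src[1..n-1] at dst[1..n] */
+C	  rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
+C	  /* add src[i]*src[i+1..n-1] at dst[2*i+1...], for i=1..n-2 */
+C	  for (i = 1; i < n - 1; i++)
+C	    rp[n + i] = mpn_addmul_1 (rp + 2*i + 1, up + i + 1,
+C	                              n - 1 - i, up[i]);
+C	  /* double the triangle; the bit shifted out becomes dst[2*n-1] */
+C	  rp[2*n - 1] = mpn_lshift (rp + 1, rp + 1, 2*n - 2, 1);
+C	  /* add the squares src[i]^2 along the diagonal; dst[0] simply
+C	     gets the low limb of src[0]^2 */
+C	  rp[0] = 0;
+C	  for (i = 0; i < n; i++)
+C	    {
+C	      mp_limb_t hi = mpn_addmul_1 (rp + 2*i, up + i, 1, up[i]);
+C	      /* propagate; the carry out at the top is provably zero */
+C	      mpn_add_1 (rp + 2*i + 1, rp + 2*i + 1, 2*n - 2*i - 1, hi);
+C	    }
+C	}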
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_sqr_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_SIZE, %edx
+
+	movl	PARAM_SRC, %eax
+
+	cmpl	$2, %edx
+	movl	PARAM_DST, %ecx
+
+	je	L(two_limbs)
+	ja	L(three_or_more)
+
+
+C -----------------------------------------------------------------------------
+C one limb only
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx
+
+	movl	(%eax), %eax
+	mull	%eax
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(two_limbs):
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx
+
+	pushl	%ebx
+	pushl	%ebp
+
+	movl	%eax, %ebx
+	movl	(%eax), %eax
+
+	mull	%eax		C src[0]^2
+
+	pushl	%esi
+	pushl	%edi
+
+	movl	%edx, %esi	C dst[1]
+	movl	%eax, (%ecx)	C dst[0]
+
+	movl	4(%ebx), %eax
+	mull	%eax		C src[1]^2
+
+	movl	%eax, %edi	C dst[2]
+	movl	%edx, %ebp	C dst[3]
+
+	movl	(%ebx), %eax
+	mull	4(%ebx)		C src[0]*src[1]
+
+	addl	%eax, %esi
+
+	adcl	%edx, %edi
+
+	adcl	$0, %ebp
+	addl	%esi, %eax
+
+	adcl	%edi, %edx
+	movl	%eax, 4(%ecx)
+
+	adcl	$0, %ebp
+
+	movl	%edx, 8(%ecx)
+	movl	%ebp, 12(%ecx)
+
+	popl	%edi
+	popl	%esi
+
+	popl	%ebp
+	popl	%ebx
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(8)
+L(three_or_more):
+deflit(`FRAME',0)
+	C eax	src
+	C ebx
+	C ecx	dst
+	C edx	size
+
+	pushl	%ebx	FRAME_pushl()
+	pushl	%edi	FRAME_pushl()
+
+	pushl	%esi	FRAME_pushl()
+	pushl	%ebp	FRAME_pushl()
+
+	leal	(%ecx,%edx,4), %edi	C &dst[size], end of this mul1
+	leal	(%eax,%edx,4), %esi	C &src[size]
+
+C First multiply src[0]*src[1..size-1] and store at dst[1..size].
+
+	movl	(%eax), %ebp	C src[0], multiplier
+	movl	%edx, %ecx
+
+	negl	%ecx		C -size
+	xorl	%ebx, %ebx	C clear carry limb
+
+	incl	%ecx		C -(size-1)
+
+L(mul1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, limbs, negative
+	C edx	scratch
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+	mull	%ebp
+	addl	%eax, %ebx
+	adcl	$0, %edx
+	movl	%ebx, (%edi,%ecx,4)
+	movl	%edx, %ebx
+	incl	%ecx
+	jnz	L(mul1)
+
+	movl	%ebx, (%edi)
+
+
+	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
+	C n=1..size-2.
+	C
+	C The last product, src[size-2]*src[size-1], is the end corner of
+	C the product triangle and is handled separately at the end to save
+	C looping overhead.  If size is 3 then it's only this that needs to
+	C be done.
+	C
+	C In the outer loop %esi is a constant, and %edi just advances by 1
+	C limb each time.  The size of the operation decreases by 1 limb
+	C each time.
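+	C
+	C In C, each pass of the L(inner) loop below amounts to the
+	C following step (a sketch only; addmul_step is an illustrative
+	C name, not a GMP routine, and 32-bit limbs are assumed):
+	C
+	C   uint32_t addmul_step (uint32_t *dp, uint32_t u, uint32_t m,
+	C                         uint32_t c)
+	C   {
+	C     /* u*m + c + *dp cannot overflow 64 bits: the maximum is
+	C        (2^32-1)^2 + 2*(2^32-1) = 2^64 - 1 */
+	C     uint64_t t = (uint64_t) u * m + c + *dp;
+	C     *dp = (uint32_t) t;           /* low half back to dst */
+	C     return (uint32_t) (t >> 32);  /* high half is next carry */
+	C   }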
+
+	C eax
+	C ebx	carry (needing carry flag added)
+	C ecx
+	C edx
+	C esi	&src[size]
+	C edi	&dst[size]
+	C ebp
+
+	movl	PARAM_SIZE, %ecx
+	subl	$3, %ecx
+	jz	L(corner)
+
+	negl	%ecx
+
+dnl  re-use parameter space
+define(VAR_OUTER,`PARAM_DST')
+
+L(outer):
+	C eax
+	C ebx
+	C ecx
+	C edx	outer loop counter, -(size-3) to -1
+	C esi	&src[size]
+	C edi	dst, pointing at stored carry limb of previous loop
+	C ebp
+
+	movl	%ecx, VAR_OUTER
+	addl	$4, %edi		C advance dst end
+
+	movl	-8(%esi,%ecx,4), %ebp	C next multiplier
+	subl	$1, %ecx
+
+	xorl	%ebx, %ebx		C initial carry limb
+
+L(inner):
+	C eax	scratch
+	C ebx	carry (needing carry flag added)
+	C ecx	counter, -n-1 to -1
+	C edx	scratch
+	C esi	&src[size]
+	C edi	dst end of this addmul
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+	mull	%ebp
+	addl	%ebx, %eax
+	adcl	$0, %edx
+	addl	%eax, (%edi,%ecx,4)
+	adcl	$0, %edx
+	movl	%edx, %ebx
+	addl	$1, %ecx
+	jl	L(inner)
+
+
+	movl	%ebx, (%edi)
+	movl	VAR_OUTER, %ecx
+	incl	%ecx
+	jnz	L(outer)
+
+
+L(corner):
+	C esi	&src[size]
+	C edi	&dst[2*size-3]
+
+	movl	-4(%esi), %eax
+	mull	-8(%esi)		C src[size-1]*src[size-2]
+	addl	%eax, 0(%edi)
+	adcl	$0, %edx
+	movl	%edx, 4(%edi)		C dst high limb
+
+
+C -----------------------------------------------------------------------------
+C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
+
+	movl	PARAM_SIZE, %eax
+	negl	%eax
+	addl	$1, %eax		C -(size-1) and clear carry
+
+L(lshift):
+	C eax	counter, negative
+	C ebx	next limb
+	C ecx
+	C edx
+	C esi
+	C edi	&dst[2*size-3]
+	C ebp
+
+	rcll	8(%edi,%eax,8)
+	rcll	12(%edi,%eax,8)
+	incl	%eax
+	jnz	L(lshift)
+
+
+	adcl	%eax, %eax		C high bit out
+	movl	%eax, 8(%edi)		C dst most significant limb
+
+
+C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
+C src[size-1]^2.  dst[0] hasn't been set yet, and just gets the low limb of
+C src[0]^2.
+
+	movl	PARAM_SRC, %esi
+	movl	(%esi), %eax		C src[0]
+	mull	%eax			C src[0]^2
+
+	movl	PARAM_SIZE, %ecx
+	leal	(%esi,%ecx,4), %esi	C src end
+
+	negl	%ecx			C -size
+	movl	%edx, %ebx		C initial carry
+
+	movl	%eax, 12(%edi,%ecx,8)	C dst[0]
+	incl	%ecx			C -(size-1)
+
+L(diag):
+	C eax	scratch (low product)
+	C ebx	carry limb
+	C ecx	counter, -(size-1) to -1
+	C edx	scratch (high product)
+	C esi	&src[size]
+	C edi	&dst[2*size-3]
+	C ebp	scratch (fetched dst limbs)
+
+	movl	(%esi,%ecx,4), %eax
+	mull	%eax
+
+	addl	%ebx, 8(%edi,%ecx,8)
+	movl	%edx, %ebx
+
+	adcl	%eax, 12(%edi,%ecx,8)
+	adcl	$0, %ebx
+
+	incl	%ecx
+	jnz	L(diag)
+
+
+	addl	%ebx, 8(%edi)		C dst most significant limb
+
+	popl	%ebp
+	popl	%esi
+
+	popl	%edi
+	popl	%ebx
+
+	ret
+
+EPILOGUE()
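
For callers, mpn_sqr_basecase is an internal entry point; the documented
interface is mpn_sqr, which reaches the basecase code at small sizes.  A
minimal usage sketch (the limb values chosen are tiny, so the unsigned
long casts used for printing are safe):

	#include <stdio.h>
	#include <gmp.h>

	int
	main (void)
	{
	  mp_limb_t u[2] = { 3, 1 };	/* the value 1*B + 3, B = limb base */
	  mp_limb_t r[4];		/* squaring n limbs needs 2*n limbs */

	  mpn_sqr (r, u, 2);		/* public entry; small sizes end up
					   in mpn_sqr_basecase */

	  for (int i = 3; i >= 0; i--)	/* most significant limb first */
	    printf ("%lu ", (unsigned long) r[i]);
	  printf ("\n");		/* (B+3)^2 = B^2 + 6B + 9,
					   so this prints: 0 1 6 9 */
	  return 0;
	}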