From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie <antigravityd@gmail.com>
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm | 200 +++++++++++++++++++++++++++++++++
 1 file changed, 200 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm

(limited to 'gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm')

diff --git a/gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm b/gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm
new file mode 100644
index 0000000..85af96f
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/div_qr_2u_pi1.asm
@@ -0,0 +1,200 @@
+dnl  x86-64 mpn_div_qr_2u_pi1
+dnl  -- Divide an mpn number by an unnormalized 2-limb number,
+dnl     using a single-limb inverse and shifting the dividend on the fly.
+
+dnl  Copyright 2007, 2008, 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		c/l
+C INPUT PARAMETERS
+define(`qp',		`%rdi')
+define(`rp',		`%rsi')
+define(`up_param',	`%rdx')
+define(`un_param',	`%rcx') dnl %rcx needed for shift count
+define(`d1',		`%r8')
+define(`d0',		`%r9')
+define(`shift_param',	`FRAME+8(%rsp)')
+define(`di_param',	`FRAME+16(%rsp)')
+
+define(`di',		`%r10')
+define(`up',		`%r11')
+define(`un',		`%rbp')
+define(`u2',		`%rbx')
+define(`u1',		`%r12')
+define(`u0',		`%rsi') dnl Same as rp, which is saved and restored.
+define(`t1',		`%r13')
+define(`t0',		`%r14')
+define(`md1',		`%r15')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+deflit(`FRAME', 0)
+PROLOGUE(mpn_div_qr_2u_pi1)
+	mov	di_param, di
+	mov	up_param, up
+	push	%r15
+	push	%r14
+	push	%r13
+	push	%r12
+	push	%rbx
+	push	%rbp
+	push	rp
+deflit(`FRAME', 56)
+	lea	-2(un_param), un
+	mov	d1, md1
+	neg	md1
+
+	C int parameter, 32 bits only
+	movl	shift_param, R32(%rcx)
+
+	C FIXME: Different code for SHLD_SLOW
+
+	xor	R32(u2), R32(u2)
+	mov	8(up, un, 8), u1
+	shld	%cl, u1, u2
+	C Remains to read (up, un, 8) and shift u1, u0
+	C udiv_qr_3by2 (qh,u2,u1,u2,u1,n0, d1,d0,di)
+	mov	di, %rax
+	mul	u2
+	mov	(up, un, 8), u0
+	shld	%cl, u0, u1
+	mov	u1, t0
+	add	%rax, t0	C q0 in t0
+	adc	u2, %rdx
+	mov	%rdx, t1	C q in t1
+	imul	md1, %rdx
+	mov	d0, %rax
+	lea	(%rdx, u1), u2
+	mul	t1
+	mov	u0, u1
+	shl	%cl, u1
+	sub	d0, u1
+	sbb	d1, u2
+	sub	%rax, u1
+	sbb	%rdx, u2
+	xor	R32(%rax), R32(%rax)
+	xor	R32(%rdx), R32(%rdx)
+	cmp	t0, u2
+	cmovnc	d0, %rax
+	cmovnc	d1, %rdx
+	adc	$0, t1
+	nop
+	add	%rax, u1
+	adc	%rdx, u2
+	cmp	d1, u2
+	jae	L(fix_qh)
+L(bck_qh):
+	push	t1	C push qh on stack
+
+	jmp	L(next)
+
+	ALIGN(16)
+L(loop):
+	C udiv_qr_3by2 (q,u2,u1,u2,u1,n0, d1,d0,di)
+	C Based on the optimized divrem_2.asm code.
+
+	mov	di, %rax
+	mul	u2
+	mov	(up, un, 8), u0
+	xor	R32(t1), R32(t1)
+	shld	%cl, u0, t1
+	or	t1, u1
+	mov	u1, t0
+	add	%rax, t0	C q0 in t0
+	adc	u2, %rdx
+	mov	%rdx, t1	C q in t1
+	imul	md1, %rdx
+	mov	d0, %rax
+	lea	(%rdx, u1), u2
+	mul	t1
+	mov	u0, u1
+	shl	%cl, u1
+	sub	d0, u1
+	sbb	d1, u2
+	sub	%rax, u1
+	sbb	%rdx, u2
+	xor	R32(%rax), R32(%rax)
+	xor	R32(%rdx), R32(%rdx)
+	cmp	t0, u2
+	cmovnc	d0, %rax
+	cmovnc	d1, %rdx
+	adc	$0, t1
+	nop
+	add	%rax, u1
+	adc	%rdx, u2
+	cmp	d1, u2
+	jae	L(fix)
+L(bck):
+	mov	t1, (qp, un, 8)
+L(next):
+	sub	$1, un
+	jnc	L(loop)
+L(end):
+	C qh on stack
+	pop	%rax
+	pop	rp
+	shrd	%cl, u2, u1
+	shr	%cl, u2
+	mov	u2, 8(rp)
+	mov	u1, (rp)
+
+	pop	%rbp
+	pop	%rbx
+	pop	%r12
+	pop	%r13
+	pop	%r14
+	pop	%r15
+	ret
+
+L(fix):	C Unlikely update. u2 >= d1
+	seta	%dl
+	cmp	d0, u1
+	setae	%al
+	orb	%dl, %al		C "orb" form to placate Sun tools
+	je	L(bck)
+	inc	t1
+	sub	d0, u1
+	sbb	d1, u2
+	jmp	L(bck)
+
+C Duplicated, just jumping back to a different address.
+L(fix_qh):	C Unlikely update. u2 >= d1
+	seta	%dl
+	cmp	d0, u1
+	setae	%al
+	orb	%dl, %al		C "orb" form to placate Sun tools
+	je	L(bck_qh)
+	inc	t1
+	sub	d0, u1
+	sbb	d1, u2
+	jmp	L(bck_qh)
+EPILOGUE()
-- 
cgit v1.2.3