Diffstat (limited to 'gmp-6.3.0/mpn/powerpc32/divrem_2.asm')
-rw-r--r--  gmp-6.3.0/mpn/powerpc32/divrem_2.asm  182
1 file changed, 182 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/powerpc32/divrem_2.asm b/gmp-6.3.0/mpn/powerpc32/divrem_2.asm
new file mode 100644
index 0000000..74423f4
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/divrem_2.asm
@@ -0,0 +1,182 @@
+dnl PPC-32 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl Copyright 2007, 2008, 2012, 2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                 cycles/limb
+C                 norm   frac
+C 7410           ~36.5  ~36.5
+C 744x, 745x        29     29
+
+C INPUT PARAMETERS
+C qp = r3
+C fn = r4
+C up = r5
+C un = r6
+C d = r7
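+C
+C In C terms (parameter names as in the comments above; d points at the
+C two divisor limbs, least significant first), the routine implements:
+C   mp_limb_t mpn_divrem_2 (mp_ptr qp, mp_size_t fn,
+C                           mp_ptr up, mp_size_t un, mp_srcptr dp);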
+
+C TODO
+C * Decrease register usage.
+C * Make sure mul operands are optimal for early-out.
+C * Check that things work well for a shared library build.
+C * Write an invert_limb, perhaps inline, perhaps as a private call. Or at
+C least vastly improve the current __udiv_qrnnd_c based code.
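+C
+C   (In C with 64-bit arithmetic, such a 2/1 reciprocal of a normalized
+C   32-bit d is simply di = (uint32_t) (~(uint64_t) 0 / d), i.e.
+C   floor((B^2-1)/d) - B with B = 2^32.)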
+
+
+ASM_START()
+PROLOGUE(mpn_divrem_2)
+ stwu r1, -32(r1) C allocate stack frame
+ slwi r0, r6, 2 C un * 4 bytes
+ add r5, r5, r0 C up_param + un
+ stmw r28, 8(r1) C save r28-r31
+ addi r29, r5, -8 C up = up_param + un - 2
+ lwz r10, 4(r7) C d1
+ lwz r12, 4(r29) C up[un-1]
+ addi r8, r3, -12 C qp - 3 limbs (bias, adjusted below)
+ lwz r7, 0(r7) C d0
+ cmplw cr7, r12, r10 C up[un-1] >= d1?
+ lwz r28, 0(r29) C up[un-2]
+ blt- cr7, L(2)
+ bgt+ cr7, L(4)
+ cmplw cr7, r28, r7 C high limbs equal: up[un-2] >= d0?
+ blt- cr7, L(2)
+L(4): subfc r28, r7, r28 C {up[un-1],up[un-2]} -= {d1,d0}
+ subfe r12, r10, r12
+ li r3, 1 C most significant quotient limb = 1
+ b L(6)
+L(2): li r3, 0 C most significant quotient limb = 0
+
+L(6): add r0, r4, r6 C un + fn
+ addic. r30, r0, -2 C limb count = un + fn - 2
+ ble- cr0, L(ret)
+
+ slwi r9, r0, 2 C (un + fn) * 4
+ add r8, r8, r9 C qp += un + fn - 3 (top quotient limb)
+ mtctr r30 C set loop count
+
+C Compute di from d1
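+C (di = floor((B^2-1)/d1) - B, i.e. udiv_qrnnd(di, dummy, ~d1, ~0, d1),
+C done in two 16-bit division steps in the style of __udiv_qrnnd_c)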
+ srwi r11, r10, 16 C d1 high half
+ nor r0, r10, r10 C n1 = ~d1
+ divwu r31, r0, r11 C q1 = n1 / (d1 >> 16)
+ rlwinm r5, r10, 0, 16, 31 C d1 low half
+ mullw r9, r11, r31
+ mullw r6, r5, r31 C m = q1 * (d1 & 0xffff)
+ subf r0, r9, r0 C r1 = n1 - q1 * (d1 >> 16)
+ slwi r0, r0, 16
+ ori r0, r0, 65535 C bring in next 16 bits of n0 = ~0
+ cmplw cr7, r0, r6 C r1 < m? then adjust q1
+ bge- cr7, L(9)
+ add r0, r0, r10 C r1 += d1
+ cmplw cr7, r0, r10
+ cmplw cr6, r6, r0
+ addi r31, r31, -1 C q1--
+ crorc 28, 28, 25 C cr7.lt = carry | (r1 >= m)
+ blt+ cr7, L(9)
+ addi r31, r31, -1 C q1--
+ add r0, r0, r10 C r1 += d1
+L(9): subf r0, r6, r0 C r1 -= m
+ divwu r6, r0, r11 C q0 = r1 / (d1 >> 16)
+ mullw r9, r11, r6
+ mullw r11, r5, r6 C m = q0 * (d1 & 0xffff)
+ subf r0, r9, r0 C r1 -= q0 * (d1 >> 16)
+ slwi r0, r0, 16
+ ori r0, r0, 65535 C bring in low 16 bits of n0
+ cmplw cr7, r0, r11 C r1 < m? then adjust q0
+ bge- cr7, L(13)
+ add r0, r0, r10 C r1 += d1
+ cmplw cr7, r0, r10
+ cmplw cr6, r11, r0
+ addi r6, r6, -1 C q0--
+ crorc 28, 28, 25 C cr7.lt = carry | (r1 >= m)
+ blt+ cr7, L(13)
+C add r0, r0, r10 C final remainder
+ addi r6, r6, -1 C q0--
+L(13): rlwimi r6, r31, 16, 0, 15 C assemble di = q1 << 16 | q0
+
+C Adjust di by including d0
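+C (so far di only inverts d1; decrement it while d1*di + d0 + HI(d0*di)
+C overflows, making it a 3/2 inverse of {d1,d0} a la Moller-Granlund)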
+ mullw r9, r10, r6 C t0 = LO(di * d1)
+ addc r11, r9, r7 C t0 + d0
+ subfe r0, r1, r1 C r0 = carry - 1
+ mulhwu r9, r6, r7 C s1 = HI(di * d0)
+ addc r9, r11, r9 C p = t0 + d0 + s1
+ addze. r0, r0 C collect carries; overflow iff r0 >= 0
+ blt cr0, L(17)
+L(18): subfc r9, r10, r9 C p -= d1 ...
+ addi r6, r6, -1 C ... di-- until no overflow remains
+ addme. r0, r0
+ bge+ cr0, L(18)
+L(17):
+
+C   r0   r3   r4   r5   r6   r7   r8   r9  r10  r11  r12  r28  r29  r30  r31
+C       msl   fn        di   d0   qp        d1                  up   un
+L(loop):
+ mullw r0, r12, r6 C q0 = LO(n2 * di)
+ cmpw cr7, r30, r4 C integer limbs left?
+ addc r31, r0, r28 C q0 += n1
+ mulhwu r9, r12, r6 C q = HI(n2 * di)
+ adde r12, r9, r12 C q += n2
+ addi r30, r30, -1 C count--
+ mullw r0, r10, r12 C d1 * q
+ li r9, 0 C n0 = 0 for fraction limbs
+ subf r0, r0, r28 C n1 -= d1 * q
+ addi r5, r12, 1 C candidate quotient limb q + 1
+ ble- cr7, L(23)
+ lwzu r9, -4(r29) C n0 = next limb of up
+L(23): mullw r11, r12, r7 C t0 = LO(d0 * q)
+ subfc r28, r7, r9 C n0 -= d0
+ subfe r0, r10, r0 C n1 -= d1
+ mulhwu r12, r12, r7 C t1 = HI(d0 * q)
+ subfc r28, r11, r28 C n0 -= t0
+ subfe r12, r12, r0 C n1 -= t1
+ cmplw cr7, r12, r31 C n1 >= q0? then adjust
+ blt+ cr7, L(24)
+ addc r28, r28, r7 C add back {d1,d0}
+ adde r12, r12, r10
+ addi r5, r5, -1 C q--
+L(24): cmplw cr7, r12, r10 C remainder still >= d? (rare)
+ bge- cr7, L(fix)
+L(bck): stw r5, 0(r8) C store quotient limb
+ addi r8, r8, -4 C qp--
+ bdnz L(loop)
+
+L(ret): stw r28, 0(r29) C store 2-limb remainder at up[0]
+ stw r12, 4(r29) C ... and up[1]
+ lmw r28, 8(r1) C restore r28-r31
+ addi r1, r1, 32 C release stack frame
+ blr
+
+L(fix): cmplw cr6, r28, r7 C compare n0 with d0
+ bgt+ cr7, L(28) C n1 > d1: subtract
+ blt- cr6, L(bck) C n1 = d1, n0 < d0: no adjust
+L(28): subfc r28, r7, r28 C {n1,n0} -= {d1,d0}
+ subfe r12, r10, r12
+ addi r5, r5, 1 C q++
+ b L(bck)
+EPILOGUE()
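
The quotient step in L(loop) is the 3/2 division of Möller and Granlund
("Improved division by invariant integers", IEEE Transactions on
Computers, 2011), which GMP also expresses as the udiv_qr_3by2 macro in
gmp-impl.h.  A minimal C sketch of one iteration for 32-bit limbs,
assuming a normalized divisor {d1,d0} and the inverse di prepared as
above (a hypothetical helper mirroring the asm's structure, not GMP's
actual code):

    #include <stdint.h>

    /* Divide {n2,n1,n0} by the normalized divisor {d1,d0}, given
       di = floor((B^2-1)/d1) - B with B = 2^32.  Returns the quotient
       limb; the two-limb remainder comes back via r1p/r0p. */
    static uint32_t
    div_3by2_sketch (uint32_t n2, uint32_t n1, uint32_t n0,
                     uint32_t d1, uint32_t d0, uint32_t di,
                     uint32_t *r1p, uint32_t *r0p)
    {
      uint64_t d = ((uint64_t) d1 << 32) | d0;

      /* q = n2 * di + {n2,n1}, cf. mullw/mulhwu plus addc/adde */
      uint64_t q = (uint64_t) n2 * di + (((uint64_t) n2 << 32) | n1);
      uint32_t q1 = (uint32_t) (q >> 32), q0 = (uint32_t) q;

      /* candidate remainder {n1 - d1*q1, n0} - d0*q1 - d,
         cf. the subf/subfc/subfe chain around L(23) */
      uint32_t r1 = n1 - d1 * q1;
      uint64_t r = (((uint64_t) r1 << 32) | n0) - (uint64_t) d0 * q1 - d;

      q1++;                              /* cf. addi r5, r12, 1 */
      if ((uint32_t) (r >> 32) >= q0)    /* cf. cmplw r12, r31 at L(24) */
        {
          q1--;                          /* add back {d1,d0}, q-- */
          r += d;
        }
      if (r >= d)                        /* rare extra step, cf. L(fix) */
        {
          q1++;
          r -= d;
        }
      *r1p = (uint32_t) (r >> 32);
      *r0p = (uint32_t) r;
      return q1;
    }

The asm loop feeds n0 from successive limbs of up (lwzu r9, -4(r29))
while integer limbs remain and uses n0 = 0 for the final fn fraction
steps (li r9, 0), carrying the two-limb partial remainder between
iterations in r12/r28.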