From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm | 336 ++++++++++++++++++++++++++++++++++++
 1 file changed, 336 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm

diff --git a/gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm b/gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm
new file mode 100644
index 0000000..82c42ae
--- /dev/null
+++ b/gmp-6.3.0/mpn/alpha/ev6/mod_1_4.asm
@@ -0,0 +1,336 @@
+dnl  Alpha mpn_mod_1s_4p
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Optimise. 2.75 c/l should be possible.
+C  * Write a proper mpn_mod_1s_4p_cps.  The code below was compiler generated.
+C  * Optimise feed-in code, starting the sw pipeline in switch code.
+C  * Shorten software pipeline.  The mul instructions are scheduled too far
+C    from their users.  Fixing this will allow us to use fewer registers.
+C  * If we cannot reduce register usage, write perhaps small-n basecase.
+C  * Does this work for PIC?
+
+C      cycles/limb
+C EV4:      ?
+C EV5:     23
+C EV6:      3
+
+define(`ap', `r16')
+define(`n', `r17')
+define(`pl', `r24')
+define(`ph', `r25')
+define(`rl', `r6')
+define(`rh', `r7')
+define(`B1modb', `r1')
+define(`B2modb', `r2')
+define(`B3modb', `r3')
+define(`B4modb', `r4')
+define(`B5modb', `r5')
+
+ASM_START()
+PROLOGUE(mpn_mod_1s_4p)
+	lda	r30, -64(r30)
+	stq	r9, 8(r30)
+	ldq	B1modb, 16(r19)
+	stq	r10, 16(r30)
+	ldq	B2modb, 24(r19)
+	stq	r11, 24(r30)
+	ldq	B3modb, 32(r19)
+	stq	r12, 32(r30)
+	ldq	B4modb, 40(r19)
+	stq	r13, 40(r30)
+	ldq	B5modb, 48(r19)
+	s8addq	n, ap, ap		C point ap at vector end
+
+	and	n, 3, r0
+	lda	n, -4(n)
+	beq	r0, L(b0)
+	lda	r6, -2(r0)
+	blt	r6, L(b1)
+	beq	r6, L(b2)
+
+L(b3):	ldq	r21, -16(ap)
+	ldq	r22, -8(ap)
+	ldq	r20, -24(ap)
+	mulq	r21, B1modb, r8
+	umulh	r21, B1modb, r12
+	mulq	r22, B2modb, r9
+	umulh	r22, B2modb, r13
+	addq	r8, r20, pl
+	cmpult	pl, r8, r0
+	addq	r0, r12, ph
+	addq	r9, pl, rl
+	cmpult	rl, r9, r0
+	addq	r13, ph, ph
+	addq	r0, ph, rh
+	lda	ap, -56(ap)
+	br	L(com)
+
+L(b0):	ldq	r21, -24(ap)
+	ldq	r22, -16(ap)
+	ldq	r23, -8(ap)
+	ldq	r20, -32(ap)
+	mulq	r21, B1modb, r8
+	umulh	r21, B1modb, r12
+	mulq	r22, B2modb, r9
+	umulh	r22, B2modb, r13
+	mulq	r23, B3modb, r10
+	umulh	r23, B3modb, r27
+	addq	r8, r20, pl
+	cmpult	pl, r8, r0
+	addq	r0, r12, ph
+	addq	r9, pl, pl
+	cmpult	pl, r9, r0
+	addq	r13, ph, ph
+	addq	r0, ph, ph
+	addq	r10, pl, rl
+	cmpult	rl, r10, r0
+	addq	r27, ph, ph
+	addq	r0, ph, rh
+	lda	ap, -64(ap)
+	br	L(com)
+
+L(b1):	bis	r31, r31, rh
+	ldq	rl, -8(ap)
+	lda	ap, -40(ap)
+	br	L(com)
+
+L(b2):	ldq	rh, -8(ap)
+	ldq	rl, -16(ap)
+	lda	ap, -48(ap)
+
+L(com):	ble	n, L(ed3)
+	ldq	r21, 8(ap)
+	ldq	r22, 16(ap)
+	ldq	r23, 24(ap)
+	ldq	r20, 0(ap)
+	lda	n, -4(n)
+	lda	ap, -32(ap)
+	mulq	r21, B1modb, r8
+	umulh	r21, B1modb, r12
+	mulq	r22, B2modb, r9
+	umulh	r22, B2modb, r13
+	mulq	r23, B3modb, r10
+	umulh	r23, B3modb, r27
+	mulq	rl, B4modb, r11
+	umulh	rl, B4modb, r28
+	ble	n, L(ed2)
+
+	ALIGN(16)
+L(top):	ldq	r21, 8(ap)
+	mulq	rh, B5modb, rl
+	addq	r8, r20, pl
+	ldq	r22, 16(ap)
+	cmpult	pl, r8, r0
+	umulh	rh, B5modb, rh
+	ldq	r23, 24(ap)
+	addq	r0, r12, ph
+	addq	r9, pl, pl
+	mulq	r21, B1modb, r8
+	cmpult	pl, r9, r0
+	addq	r13, ph, ph
+	umulh	r21, B1modb, r12
+	lda	ap, -32(ap)
+	addq	r0, ph, ph
+	addq	r10, pl, pl
+	mulq	r22, B2modb, r9
+	cmpult	pl, r10, r0
+	addq	r27, ph, ph
+	addq	r11, pl, pl
+	umulh	r22, B2modb, r13
+	addq	r0, ph, ph
+	cmpult	pl, r11, r0
+	addq	r28, ph, ph
+	mulq	r23, B3modb, r10
+	ldq	r20, 32(ap)
+	addq	pl, rl, rl
+	umulh	r23, B3modb, r27
+	addq	r0, ph, ph
+	cmpult	rl, pl, r0
+	mulq	rl, B4modb, r11
+	addq	ph, rh, rh
+	umulh	rl, B4modb, r28
+	addq	r0, rh, rh
+	lda	n, -4(n)
+	bgt	n, L(top)
+
+L(ed2):	mulq	rh, B5modb, rl
+	addq	r8, r20, pl
+	umulh	rh, B5modb, rh
+	cmpult	pl, r8, r0
+	addq	r0, r12, ph
+	addq	r9, pl, pl
+	cmpult	pl, r9, r0
+	addq	r13, ph, ph
+	addq	r0, ph, ph
+	addq	r10, pl, pl
+	cmpult	pl, r10, r0
+	addq	r27, ph, ph
+	addq	r11, pl, pl
+	addq	r0, ph, ph
+	cmpult	pl, r11, r0
+	addq	r28, ph, ph
+	addq	pl, rl, rl
+	addq	r0, ph, ph
+	cmpult	rl, pl, r0
+	addq	ph, rh, rh
+	addq	r0, rh, rh
+
+L(ed3):	mulq	rh, B1modb, r8
+	umulh	rh, B1modb, rh
+	addq	r8, rl, rl
+	cmpult	rl, r8, r0
+	addq	r0, rh, rh
+
+	ldq	r24, 8(r19)		C cnt
+	sll	rh, r24, rh
+	subq	r31, r24, r25
+	srl	rl, r25, r2
+	sll	rl, r24, rl
+	or	r2, rh, rh
+
+	ldq	r23, 0(r19)		C bi
+	mulq	rh, r23, r8
+	umulh	rh, r23, r9
+	addq	rh, 1, r7
+	addq	r8, rl, r8		C ql
+	cmpult	r8, rl, r0
+	addq	r9, r7, r9
+	addq	r0, r9, r9		C qh
+	mulq	r9, r18, r21		C qh * b
+	subq	rl, r21, rl
+	cmpult	r8, rl, r0		C rl > ql
+	negq	r0, r0
+	and	r0, r18, r0
+	addq	rl, r0, rl
+	cmpule	r18, rl, r0		C rl >= b
+	negq	r0, r0
+	and	r0, r18, r0
+	subq	rl, r0, rl
+
+	srl	rl, r24, r0
+
+	ldq	r9, 8(r30)
+	ldq	r10, 16(r30)
+	ldq	r11, 24(r30)
+	ldq	r12, 32(r30)
+	ldq	r13, 40(r30)
+	lda	r30, 64(r30)
+	ret	r31, (r26), 1
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1s_4p_cps,gp)
+	lda	r30, -32(r30)
+	stq	r26, 0(r30)
+	stq	r9, 8(r30)
+	stq	r10, 16(r30)
+	stq	r11, 24(r30)
+	mov	r16, r11
+	LEA(	r4, __clz_tab)
+	lda	r10, 65(r31)
+	cmpbge	r31, r17, r1
+	srl	r1, 1, r1
+	xor	r1, 127, r1
+	addq	r1, r4, r1
+	ldq_u	r2, 0(r1)
+	extbl	r2, r1, r2
+	s8subq	r2, 7, r2
+	srl	r17, r2, r3
+	subq	r10, r2, r10
+	addq	r3, r4, r3
+	ldq_u	r1, 0(r3)
+	extbl	r1, r3, r1
+	subq	r10, r1, r10
+	sll	r17, r10, r9
+	mov	r9, r16
+	jsr	r26, mpn_invert_limb
+	LDGP(	r29, 0(r26))
+	subq	r31, r10, r2
+	lda	r1, 1(r31)
+	sll	r1, r10, r1
+	subq	r31, r9, r3
+	srl	r0, r2, r2
+	ldq	r26, 0(r30)
+	bis	r2, r1, r2
+	stq	r0, 0(r11)
+	stq	r10, 8(r11)
+	mulq	r2, r3, r2
+	srl	r2, r10, r3
+	umulh	r2, r0, r1
+	stq	r3, 16(r11)
+	mulq	r2, r0, r3
+	ornot	r31, r1, r1
+	subq	r1, r2, r1
+	mulq	r1, r9, r1
+	addq	r1, r9, r2
+	cmpule	r1, r3, r3
+	cmoveq	r3, r2, r1
+	srl	r1, r10, r3
+	umulh	r1, r0, r2
+	stq	r3, 24(r11)
+	mulq	r1, r0, r3
+	ornot	r31, r2, r2
+	subq	r2, r1, r2
+	mulq	r2, r9, r2
+	addq	r2, r9, r1
+	cmpule	r2, r3, r3
+	cmoveq	r3, r1, r2
+	srl	r2, r10, r1
+	umulh	r2, r0, r3
+	stq	r1, 32(r11)
+	mulq	r2, r0, r1
+	ornot	r31, r3, r3
+	subq	r3, r2, r3
+	mulq	r3, r9, r3
+	addq	r3, r9, r2
+	cmpule	r3, r1, r1
+	cmoveq	r1, r2, r3
+	srl	r3, r10, r2
+	umulh	r3, r0, r1
+	stq	r2, 40(r11)
+	mulq	r3, r0, r0
+	ornot	r31, r1, r1
+	subq	r1, r3, r1
+	mulq	r1, r9, r1
+	addq	r1, r9, r9
+	cmpule	r1, r0, r0
+	cmoveq	r0, r9, r1
+	ldq	r9, 8(r30)
+	srl	r1, r10, r1
+	ldq	r10, 16(r30)
+	stq	r1, 48(r11)
+	ldq	r11, 24(r30)
+	lda	r30, 32(r30)
+	ret	r31, (r26), 1
+EPILOGUE()
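A note for readers who do not follow Alpha assembly: below is a minimal C sketch of the reduction scheme this patch implements. It is not GMP's code. The function names (precompute_Bk, mod_1_4_sketch, mod_1_ref) are invented for this example, it assumes a compiler with unsigned __int128, and it reduces the running value modulo b on every iteration, whereas the assembly keeps an unreduced two-limb accumulator (the rh:rl pair) and divides only once at the end, using the reciprocal and shift count that mpn_mod_1s_4p_cps stores at 0(r19) and 8(r19). The folding idea is the same, though: precompute B^k mod b for B = 2^64 (the B1modb..B5modb table loaded from r19) and fold four limbs per loop iteration, which is the trick behind the roughly 3 cycles/limb quoted in the header.

/* Illustrative C sketch of the mod_1s_4p folding scheme (not GMP's code).
   Names such as precompute_Bk and mod_1_4_sketch are invented here.
   Requires a compiler with unsigned __int128 (GCC/Clang on a 64-bit
   target) and a modulus b below 2^62 so the folded sum fits in 128 bits. */

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 u128;

/* Precompute B^k mod b for k = 1..4, where B = 2^64.  This is the role of
   mpn_mod_1s_4p_cps, which additionally stores a reciprocal of b and a
   normalization shift so the assembly never needs a divide instruction. */
static void precompute_Bk (uint64_t Bk[4], uint64_t b)
{
  Bk[0] = (uint64_t) (((u128) 1 << 64) % b);             /* B mod b */
  for (int k = 1; k < 4; k++)                            /* B^(k+1) mod b */
    Bk[k] = (uint64_t) (((u128) Bk[k - 1] << 64) % b);
}

/* Remainder of the n-limb number {ap, n} (least significant limb first)
   modulo b, folding four limbs per iteration.  The assembly keeps the
   running value as an unreduced rh:rl pair and folds it in through
   B^4 mod b and B^5 mod b; here we simply reduce it each round. */
static uint64_t mod_1_4_sketch (const uint64_t *ap, size_t n, uint64_t b)
{
  uint64_t Bk[4];
  u128 r = 0;
  size_t i = n;

  precompute_Bk (Bk, b);

  /* Consume the 0-3 most significant leftover limbs one at a time,
     mirroring the L(b1)/L(b2)/L(b3) entry paths. */
  while (i % 4 != 0)
    {
      i--;
      r = ((r << 64) | ap[i]) % b;
    }

  /* Main loop: one group of four limbs per pass, like L(top). */
  while (i >= 4)
    {
      i -= 4;
      u128 t = (u128) ap[i]
             + (u128) ap[i + 1] * Bk[0]   /* ap[i+1] * (B   mod b) */
             + (u128) ap[i + 2] * Bk[1]   /* ap[i+2] * (B^2 mod b) */
             + (u128) ap[i + 3] * Bk[2]   /* ap[i+3] * (B^3 mod b) */
             + r * Bk[3];                 /* old remainder * (B^4 mod b) */
      r = t % b;
    }
  return (uint64_t) r;
}

/* One-limb-at-a-time reference for checking the sketch. */
static uint64_t mod_1_ref (const uint64_t *ap, size_t n, uint64_t b)
{
  u128 r = 0;
  for (size_t i = n; i-- > 0; )
    r = ((r << 64) | ap[i]) % b;
  return (uint64_t) r;
}

int main (void)
{
  uint64_t a[7] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL,
                    0xdeadbeefcafebabeULL, 0x0102030405060708ULL,
                    0x1111111122222222ULL, 0xabcdefabcdefabcdULL,
                    0x7777777788888888ULL };
  uint64_t b = 0x1234567890abcdULL;      /* well below 2^62 */

  printf ("fold4: %016llx\nref:   %016llx\n",
          (unsigned long long) mod_1_4_sketch (a, 7, b),
          (unsigned long long) mod_1_ref (a, 7, b));
  return 0;
}

Compiled with a 64-bit GCC or Clang at -O2, the program prints the same remainder from both routines, which is a quick way to convince yourself that folding through the precomputed B^k mod b values is exact.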