From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/arm/neon/lorrshift.asm | 279 +++++++++++++++++++++++++++++++++++
 1 file changed, 279 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/arm/neon/lorrshift.asm

diff --git a/gmp-6.3.0/mpn/arm/neon/lorrshift.asm b/gmp-6.3.0/mpn/arm/neon/lorrshift.asm
new file mode 100644
index 0000000..7ebc780
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/neon/lorrshift.asm
@@ -0,0 +1,279 @@
+dnl  ARM Neon mpn_lshift and mpn_rshift.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C              cycles/limb     cycles/limb     cycles/limb     good
+C               aligned         unaligned       best seen      for cpu?
+C StrongARM        -               -
+C XScale           -               -
+C Cortex-A7        ?               ?
+C Cortex-A8        ?               ?
+C Cortex-A9        3               3                           Y
+C Cortex-A15       1.5             1.5                         Y
+
+
+C We read 64 bits at a time at 32-bit aligned addresses, and except for the
+C first and last store, we write using 64-bit aligned addresses.  All shifting
+C is done on 64-bit words in 'extension' registers.
+C
+C It should be possible to read also using 64-bit alignment, by manipulating
+C the shift count for unaligned operands.  Not done, since it does not seem to
+C matter for A9 or A15.
+C
+C This will not work in big-endian mode.
+
+C TODO
+C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
+C    which might make it tricky.
+C  * Clean up and simplify.
+C  * Consider sharing most of the code for lshift and rshift, since the feed-in
+C    code, the loop, and most of the wind-down code are identical.
+C  * Replace the basecase code with code using 'extension' registers.
+C  * Optimise.  It is not clear that this loop insn permutation is optimal for
+C    either A9 or A15.
+
+C INPUT PARAMETERS
+define(`rp',  `r0')
+define(`ap',  `r1')
+define(`n',   `r2')
+define(`cnt', `r3')
+
+ifdef(`OPERATION_lshift',`
+	define(`IFLSH', `$1')
+	define(`IFRSH', `')
+	define(`X',`0')
+	define(`Y',`1')
+	define(`func',`mpn_lshift')
+')
+ifdef(`OPERATION_rshift',`
+	define(`IFLSH', `')
+	define(`IFRSH', `$1')
+	define(`X',`1')
+	define(`Y',`0')
+	define(`func',`mpn_rshift')
+')
+
+MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
+
+ASM_START(neon)
+	TEXT
+	ALIGN(64)
+PROLOGUE(func)
+IFLSH(`	mov	r12, n, lsl #2	')
+IFLSH(`	add	rp, rp, r12	')
+IFLSH(`	add	ap, ap, r12	')
+
+	cmp	n, #4			C SIMD code n limit
+	ble	L(base)
+
+ifdef(`OPERATION_lshift',`
+	vdup.32	d6, r3			C left shift count is positive
+	sub	r3, r3, #64		C right shift count is negative
+	vdup.32	d7, r3
+	mov	r12, #-8')		C lshift pointer update offset
+ifdef(`OPERATION_rshift',`
+	rsb	r3, r3, #0		C right shift count is negative
+	vdup.32	d6, r3
+	add	r3, r3, #64		C left shift count is positive
+	vdup.32	d7, r3
+	mov	r12, #8')		C rshift pointer update offset
+
+IFLSH(`	sub	ap, ap, #8	')
+	vld1.32	{d19}, [ap], r12	C load initial 2 limbs
+	vshl.u64 d18, d19, d7		C retval
+
+	tst	rp, #4			C is rp 64-bit aligned already?
+	beq	L(rp_aligned)		C yes, skip
+IFLSH(`	add	ap, ap, #4	')	C move back ap pointer
+IFRSH(`	sub	ap, ap, #4	')	C move back ap pointer
+	vshl.u64 d4, d19, d6
+	sub	n, n, #1		C first limb handled
+IFLSH(`	sub	rp, rp, #4	')
+	vst1.32	{d4[Y]}, [rp]IFRSH(!)	C store first limb, rp gets aligned
+	vld1.32	{d19}, [ap], r12	C load ap[1] and ap[2]
+
+L(rp_aligned):
+IFLSH(`	sub	rp, rp, #8	')
+	subs	n, n, #6
+	blt	L(two_or_three_more)
+	tst	n, #2
+	beq	L(2)
+
+L(1):	vld1.32	{d17}, [ap], r12
+	vshl.u64 d5, d19, d6
+	vld1.32	{d16}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	sub	n, n, #2
+	b	L(mid)
+
+L(2):	vld1.32	{d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vld1.32	{d17}, [ap], r12
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	subs	n, n, #4
+	blt	L(end)
+
+L(top):	vld1.32	{d16}, [ap], r12
+	vorr	d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vst1.32	{d2}, [rp:64], r12
+L(mid):	vld1.32	{d17}, [ap], r12
+	vorr	d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	{d3}, [rp:64], r12
+	subs	n, n, #4
+	bge	L(top)
+
+L(end):	tst	n, #1
+	beq	L(evn)
+
+	vorr	d2, d4, d1
+	vst1.32	{d2}, [rp:64], r12
+	b	L(cj1)
+
+L(evn):	vorr	d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vshl.u64 d16, d17, d6
+	vst1.32	{d2}, [rp:64], r12
+	vorr	d2, d5, d0
+	b	L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+	tst	n, #1
+	beq	L(l2)
+
+L(l3):	vshl.u64 d5, d19, d6
+	vld1.32	{d17}, [ap], r12
+L(cj1):	veor	d16, d16, d16
+IFLSH(`	add	ap, ap, #4	')
+	vld1.32	{d16[Y]}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vorr	d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	{d3}, [rp:64], r12
+	vorr	d2, d4, d1
+	vst1.32	{d2}, [rp:64], r12
+IFLSH(`	add	rp, rp, #4	')
+	vst1.32	{d5[Y]}, [rp]
+	vmov.32	r0, d18[X]
+	bx	lr
+
+L(l2):	vld1.32	{d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vshl.u64 d1, d16, d7
+	vshl.u64 d16, d16, d6
+	vorr	d2, d4, d1
+L(cj2):	vst1.32	{d2}, [rp:64], r12
+	vst1.32	{d16}, [rp]
+	vmov.32	r0, d18[X]
+	bx	lr
+
+
+define(`tnc', `r12')
+L(base):
+	push	{r4, r6, r7, r8}
+ifdef(`OPERATION_lshift',`
+	ldr	r4, [ap, #-4]!
+	rsb	tnc, cnt, #32
+
+	mov	r7, r4, lsl cnt
+	tst	n, #1
+	beq	L(ev)			C n even
+
+L(od):	subs	n, n, #2
+	bcc	L(ed1)			C n = 1
+	ldr	r8, [ap, #-4]!
+	b	L(md)			C n = 3
+
+L(ev):	ldr	r6, [ap, #-4]!
+	subs	n, n, #2
+	beq	L(ed)			C n = 2
+					C n = 4
+L(tp):	ldr	r8, [ap, #-4]!
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(md):	ldr	r6, [ap, #-4]!
+	orr	r7, r7, r8, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r8, lsl cnt
+
+L(ed):	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(ed1):	str	r7, [rp, #-4]
+	mov	r0, r4, lsr tnc
+')
+ifdef(`OPERATION_rshift',`
+	ldr	r4, [ap]
+	rsb	tnc, cnt, #32
+
+	mov	r7, r4, lsr cnt
+	tst	n, #1
+	beq	L(ev)			C n even
+
+L(od):	subs	n, n, #2
+	bcc	L(ed1)			C n = 1
+	ldr	r8, [ap, #4]!
+	b	L(md)			C n = 3
+
+L(ev):	ldr	r6, [ap, #4]!
+	subs	n, n, #2
+	beq	L(ed)			C n = 2
+					C n = 4
+
+L(tp):	ldr	r8, [ap, #4]!
+	orr	r7, r7, r6, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt
+L(md):	ldr	r6, [ap, #4]!
+	orr	r7, r7, r8, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r8, lsr cnt
+
+L(ed):	orr	r7, r7, r6, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt
+L(ed1):	str	r7, [rp], #4
+	mov	r0, r4, lsl tnc
+')
+	pop	{r4, r6, r7, r8}
+	bx	r14
+EPILOGUE()
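
For reference, here is a minimal C model of what the two entry points compute, assuming 32-bit limbs and a shift count in the range 1..31 as mpn_lshift/mpn_rshift require; the names ref_lshift and ref_rshift are made up for this note and are not part of GMP. It shows why the lshift code walks from the most significant limb downward while rshift walks upward (so the destination may equal the source), and what the value kept in d18 and finally moved to r0 correspondsds to: the bits shifted out of the operand.

#include <stdint.h>

/* Hypothetical reference model, not part of GMP: 32-bit limbs,
   1 <= cnt <= 31.  Both functions return the bits shifted out of
   the n-limb operand {ap, n}, writing the shifted result to {rp, n}.  */
static uint32_t
ref_lshift (uint32_t *rp, const uint32_t *ap, long n, unsigned cnt)
{
  unsigned tnc = 32 - cnt;              /* like the asm's tnc register */
  uint32_t high = ap[n - 1];
  uint32_t retval = high >> tnc;        /* bits pushed out at the top */
  long i;

  for (i = n - 1; i > 0; i--)           /* high-to-low, so rp may equal ap */
    {
      uint32_t low = ap[i - 1];
      rp[i] = (high << cnt) | (low >> tnc);
      high = low;
    }
  rp[0] = high << cnt;
  return retval;
}

static uint32_t
ref_rshift (uint32_t *rp, const uint32_t *ap, long n, unsigned cnt)
{
  unsigned tnc = 32 - cnt;
  uint32_t low = ap[0];
  uint32_t retval = low << tnc;         /* bits pushed out at the bottom */
  long i;

  for (i = 0; i < n - 1; i++)           /* low-to-high, so rp may equal ap */
    {
      uint32_t high = ap[i + 1];
      rp[i] = (low >> cnt) | (high << tnc);
      low = high;
    }
  rp[n - 1] = low >> cnt;
  return retval;
}

The word-at-a-time code after L(base) is essentially these loops, unrolled by two, in core registers.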
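
The NEON path instead consumes the operand as 64-bit doublewords. A rough scalar model of the lshift direction is sketched below (a hypothetical helper, not GMP code, assuming 1 <= cnt <= 31 and a whole number of doublewords; the real code also handles the 32-bit-aligned first/last stores and odd tails). Each output doubleword is the OR of two shifts, which is what the paired vshl.u64 by d6 (the count) and by d7 (the count minus 64, a negative value that vshl treats as a right shift) feed into vorr; for rshift the roles of d6 and d7 are mirrored.

#include <stdint.h>
#include <stddef.h>

/* Hypothetical scalar model of the NEON lshift main loop, not GMP code.
   The operand is viewed as ndw little-endian 64-bit doublewords (two
   32-bit limbs each); assumes 1 <= cnt <= 31.  */
static uint64_t
model_lshift_doublewords (uint64_t *rp, const uint64_t *ap, size_t ndw,
                          unsigned cnt)
{
  /* Top bits shifted out; the asm keeps them in d18 and returns the
     low 32 bits via vmov.32 r0, d18[X].  */
  uint64_t retval = ap[ndw - 1] >> (64 - cnt);
  size_t i;

  for (i = ndw - 1; i > 0; i--)
    /* One vshl.u64 by d6 (cnt), one vshl.u64 by d7 (cnt - 64, i.e. a
       right shift by 64 - cnt), one vorr, one 64-bit store.  */
    rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (64 - cnt));
  rp[0] = ap[0] << cnt;
  return retval;
}

The assembly unrolls this to two doublewords per iteration (the L(top)/L(mid) pair) so that the loads, shifts, ORs and stores of adjacent doublewords overlap.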