From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie <antigravityd@gmail.com>
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/arm64/popcount.asm | 157 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 157 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/arm64/popcount.asm

(limited to 'gmp-6.3.0/mpn/arm64/popcount.asm')

diff --git a/gmp-6.3.0/mpn/arm64/popcount.asm b/gmp-6.3.0/mpn/arm64/popcount.asm
new file mode 100644
index 0000000..74de3fc
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm64/popcount.asm
@@ -0,0 +1,157 @@
+dnl  ARM64 Neon mpn_popcount -- mpn bit population count.
+
+dnl  Copyright 2013, 2014 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C Cortex-A53	 2.5
+C Cortex-A57	 1.14
+C X-Gene	 3
+
+C TODO
+C  * Consider greater unrolling.
+C  * Arrange to align the pointer, if that helps performance.  Use the same
+C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
+C    valgrind!)
+C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+changecom(blah)
+
+C INPUT PARAMETERS
+define(`ap', x0)
+define(`n',  x1)
+
+C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
+C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
+C (8*2^16-1)/64 = 0x1fff limbs.  We use a chunksize close to that, but which
+C  allows the huge count code to jump deep into the code (at L(chu)).
+
+define(`maxsize',  0x1fff)
+define(`chunksize',0x1ff0)
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+
+	mov	x11, #maxsize
+	cmp	n, x11
+	b.hi	L(gt8k)
+
+L(lt8k):
+	movi	v4.16b, #0			C clear summation register
+	movi	v5.16b, #0			C clear summation register
+
+	tbz	n, #0, L(xx0)
+	sub	n, n, #1
+	ld1	{v0.1d}, [ap], #8		C load 1 limb
+	cnt	v6.16b, v0.16b
+	uadalp	v4.8h,  v6.16b			C could also splat
+
+L(xx0):	tbz	n, #1, L(x00)
+	sub	n, n, #2
+	ld1	{v0.2d}, [ap], #16		C load 2 limbs
+	cnt	v6.16b, v0.16b
+	uadalp	v4.8h,  v6.16b
+
+L(x00):	tbz	n, #2, L(000)
+	subs	n, n, #4
+	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
+	b.ls	L(sum)
+
+L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
+	sub	n, n, #4
+	cnt	v6.16b, v0.16b
+	cnt	v7.16b, v1.16b
+	b	L(mid)
+
+L(000):	subs	n, n, #8
+	b.lo	L(e0)
+
+L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
+	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
+	cnt	v6.16b, v2.16b
+	cnt	v7.16b, v3.16b
+	subs	n, n, #8
+	b.lo	L(end)
+
+L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
+	uadalp	v4.8h,  v6.16b
+	cnt	v6.16b, v0.16b
+	uadalp	v5.8h,  v7.16b
+	cnt	v7.16b, v1.16b
+L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
+	subs	n, n, #8
+	uadalp	v4.8h,  v6.16b
+	cnt	v6.16b, v2.16b
+	uadalp	v5.8h,  v7.16b
+	cnt	v7.16b, v3.16b
+	b.hs	L(top)
+
+L(end):	uadalp	v4.8h,  v6.16b
+	uadalp	v5.8h,  v7.16b
+L(sum):	cnt	v6.16b, v0.16b
+	cnt	v7.16b, v1.16b
+	uadalp	v4.8h,  v6.16b
+	uadalp	v5.8h,  v7.16b
+	add	v4.8h, v4.8h, v5.8h
+					C we have 8 16-bit counts
+L(e0):	uaddlp	v4.4s,  v4.8h		C we have 4 32-bit counts
+	uaddlp	v4.2d,  v4.4s		C we have 2 64-bit counts
+	mov	x0, v4.d[0]
+	mov	x1, v4.d[1]
+	add	x0, x0, x1
+	ret
+
+C Code for count > maxsize.  Splits operand and calls above code.
+define(`ap2', x5)			C caller-saves reg not used above
+L(gt8k):
+	mov	x8, x30
+	mov	x7, n			C full count (caller-saves reg not used above)
+	mov	x4, #0			C total sum  (caller-saves reg not used above)
+	mov	x9, #chunksize*8	C caller-saves reg not used above
+	mov	x10, #chunksize		C caller-saves reg not used above
+
+1:	add	ap2, ap, x9		C point at subsequent block
+	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
+	movi	v4.16b, #0		C clear chunk summation register
+	movi	v5.16b, #0		C clear chunk summation register
+	bl	L(chu)			C jump deep inside code
+	add	x4, x4, x0
+	mov	ap, ap2			C put chunk pointer in place for calls
+	sub	x7, x7, x10
+	cmp	x7, x11
+	b.hi	1b
+
+	mov	n, x7			C count for final invocation
+	bl	L(lt8k)
+	add	x0, x4, x0
+	mov	x30, x8
+	ret
+EPILOGUE()
-- 
cgit v1.2.3