From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie <antigravityd@gmail.com>
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm          | 145 ++++++++++++
 gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm            | 162 +++++++++++++
 gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm          |  36 +++
 gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm        | 158 +++++++++++++
 gmp-6.3.0/mpn/arm/v7a/cora15/com.asm               | 180 +++++++++++++++
 gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h          | 212 +++++++++++++++++
 gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm          | 253 +++++++++++++++++++++
 gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm             | 104 +++++++++
 .../mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm      |  43 ++++
 .../mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm      |  43 ++++
 .../mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm      | 144 ++++++++++++
 gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm          |  97 ++++++++
 gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm        | 110 +++++++++
 gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm        |  90 ++++++++
 gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm   | 177 ++++++++++++++
 gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm          | 159 +++++++++++++
 16 files changed, 2113 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/com.asm
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
 create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm

(limited to 'gmp-6.3.0/mpn/arm/v7a/cora15')

diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm
new file mode 100644
index 0000000..c2277b3
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm
@@ -0,0 +1,145 @@
+dnl  ARM mpn_addmul_1 optimised for A15.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:     -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 6			3.25
+C Cortex-A15	 2			this
+
+C This code uses umlal for adding in the rp[] data, keeping the recurrency path
+C separate from any multiply instructions.  It performs well on A15, at umlal's
+C bandwidth.
+C
+C An A9 variant should perhaps stick to 3-way unrolling, and use ldm and stm
+C for all loads and stores.  Alternatively, it could do 2-way or 4-way, but
+C then alignment aware code will be necessary (adding O(1) bookkeeping
+C overhead).
+C
+C We don't use r12 due to ldrd and strd limitations.
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`v0', `r3')
+
+define(`w0', `r10') define(`w1', `r11')
+define(`u0', `r8')  define(`u1', `r9')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	push	{ r4-r11 }
+
+	ands	r6, n, #3
+	sub	n, n, #3
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	mov	r6, #0
+	cmn	r13, #0			C carry clear
+	ldr	u1, [up], #-4
+	ldr	w1, [rp], #-4
+	mov	r7, #0
+	b	L(mid)
+
+L(b00):	ldrd	u0, u1, [up]
+	ldrd	w0, w1, [rp]
+	mov	r6, #0
+	umlal	w0, r6, u0, v0
+	cmn	r13, #0			C carry clear
+	mov	r7, #0
+	str	w0, [rp]
+	b	L(mid)
+
+L(b10):	ldrd	u0, u1, [up], #8
+	ldrd	w0, w1, [rp]
+	mov	r4, #0
+	umlal	w0, r4, u0, v0
+	cmn	r13, #0			C carry clear
+	mov	r5, #0
+	str	w0, [rp], #8
+	umlal	w1, r5, u1, v0
+	tst	n, n
+	bmi	L(end)
+	b	L(top)
+
+L(b01):	mov	r4, #0
+	ldr	u1, [up], #4
+	ldr	w1, [rp], #4
+	mov	r5, #0
+	umlal	w1, r5, u1, v0
+	tst	n, n
+	bmi	L(end)
+
+	ALIGN(16)
+L(top):	ldrd	u0, u1, [up, #0]
+	adcs	r4, r4, w1
+	ldrd	w0, w1, [rp, #0]
+	mov	r6, #0
+	umlal	w0, r6, u0, v0		C 1 2
+	adcs	r5, r5, w0
+	mov	r7, #0
+	strd	r4, r5, [rp, #-4]
+L(mid):	umlal	w1, r7, u1, v0		C 2 3
+	ldrd	u0, u1, [up, #8]
+	adcs	r6, r6, w1
+	ldrd	w0, w1, [rp, #8]
+	mov	r4, #0
+	umlal	w0, r4, u0, v0		C 3 4
+	adcs	r7, r7, w0
+	mov	r5, #0
+	strd	r6, r7, [rp, #4]
+	umlal	w1, r5, u1, v0		C 0 1
+	sub	n, n, #4
+	add	up, up, #16
+	add	rp, rp, #16
+	tst	n, n
+	bpl	L(top)
+
+L(end):	adcs	r4, r4, w1
+	str	r4, [rp, #-4]
+	adc	r0, r5, #0
+	pop	{ r4-r11 }
+	bx	r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm
new file mode 100644
index 0000000..dc3f839
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm
@@ -0,0 +1,162 @@
+dnl  ARM mpn_add_n/mpn_sub_n optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:     -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.55			2.5
+C Cortex-A15	 1.27			this
+
+C This was a major improvement compared to the code we had before, but it might
+C not be the best 8-way code possible.  We've tried some permutations of auto-
+C increments and separate pointer updates, but they all ran at the same speed
+C on A15.
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`OPERATION_add_n', `
+  define(`ADDSUBC',	adcs)
+  define(`IFADD',	`$1')
+  define(`SETCY',	`cmp	$1, #1')
+  define(`RETVAL',	`adc	r0, n, #0')
+  define(`RETVAL2',	`adc	r0, n, #1')
+  define(`func',	mpn_add_n)
+  define(`func_nc',	mpn_add_nc)')
+ifdef(`OPERATION_sub_n', `
+  define(`ADDSUBC',	sbcs)
+  define(`IFADD',	`')
+  define(`SETCY',	`rsbs	$1, $1, #0')
+  define(`RETVAL',	`sbc	r0, r0, r0
+			and	r0, r0, #1')
+  define(`RETVAL2',	`RETVAL')
+  define(`func',	mpn_sub_n)
+  define(`func_nc',	mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+	ldr	r12, [sp]
+	b	L(ent)
+EPILOGUE()
+PROLOGUE(func)
+	mov	r12, #0
+L(ent):	push	{ r4-r9 }
+
+	ands	r6, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	SETCY(	r12)
+	ADDSUBC	r9, r5, r7
+	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	str	r9, [rp], #-4
+	b	L(lo)
+
+L(b00):	ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	SETCY(	r12)
+	sub	rp, rp, #16
+	b	L(mid)
+
+L(b01):	ldr	r5, [up], #-4
+	ldr	r7, [vp], #-4
+	SETCY(	r12)
+	ADDSUBC	r9, r5, r7
+	str	r9, [rp], #-12
+	tst	n, n
+	beq	L(wd1)
+L(gt1):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	b	L(mid)
+
+L(b10):	ldrd	r4, r5, [up]
+	ldrd	r6, r7, [vp]
+	SETCY(	r12)
+	sub	rp, rp, #8
+	b	L(lo)
+
+	ALIGN(16)
+L(top):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	strd	r8, r9, [rp, #8]
+L(mid):	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	ldrd	r4, r5, [up, #16]
+	ldrd	r6, r7, [vp, #16]
+	strd	r8, r9, [rp, #16]
+	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	sub	n, n, #2
+	tst	n, n
+	bmi	L(dne)
+	ldrd	r4, r5, [up, #24]
+	ldrd	r6, r7, [vp, #24]
+	strd	r8, r9, [rp, #24]
+	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	ldrd	r4, r5, [up, #32]!
+	ldrd	r6, r7, [vp, #32]!
+	strd	r8, r9, [rp, #32]!
+L(lo):	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	tst	n, n
+	bne	L(top)
+
+L(end):	strd	r8, r9, [rp, #8]
+L(wd1):	RETVAL
+	pop	{ r4-r9 }
+	bx	r14
+L(dne):	strd	r8, r9, [rp, #24]
+	RETVAL2
+	pop	{ r4-r9 }
+	bx	r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm
new file mode 100644
index 0000000..245b371
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm
@@ -0,0 +1,36 @@
+dnl  ARM mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+include_mpn(`arm/v7a/cora8/bdiv_q_1.asm')
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm
new file mode 100644
index 0000000..b9e5cd3
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm
@@ -0,0 +1,158 @@
+dnl  ARM mpn_cnd_add_n/mpn_cnd_sub_n optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:     -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.75			 3
+C Cortex-A15	 1.78			this
+
+C This code does not run as well as one could have hoped, since 1.5 c/l seems
+C realistic for this insn mix.
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`cnd',`r0')
+define(`rp', `r1')
+define(`up', `r2')
+define(`vp', `r3')
+define(`n',  `r12')
+
+ifdef(`OPERATION_cnd_add_n', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`IFADD',	`$1')
+  define(`INITCY',      `cmn	r0, #0')
+  define(`RETVAL',	`adc	r0, n, #0')
+  define(`RETVAL2',	`adc	r0, n, #1')
+  define(`func',	mpn_cnd_add_n)
+  define(`func_nc',	mpn_add_nc)')
+ifdef(`OPERATION_cnd_sub_n', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`IFADD',	`')
+  define(`INITCY',      `cmp	r0, #0')
+  define(`RETVAL',	`sbc	r0, r0, r0
+			and	r0, r0, #1')
+  define(`RETVAL2',	`RETVAL')
+  define(`func',	mpn_cnd_sub_n)
+  define(`func_nc',	mpn_sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	ldr	n, [sp]
+	push	{ r4-r9 }
+
+	cmp	cnd, #1
+	sbc	cnd, cnd, cnd		C conditionally set to 0xffffffff
+
+	ands	r6, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	bic	r7, r7, cnd
+	ADDSUB	r9, r5, r7
+	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	str	r9, [rp], #-4
+	b	L(lo)
+
+L(b00):	ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	INITCY
+	sub	rp, rp, #16
+	b	L(mid)
+
+L(b01):	ldr	r5, [up], #-4
+	ldr	r7, [vp], #-4
+	bic	r7, r7, cnd
+	ADDSUB	r9, r5, r7
+	str	r9, [rp], #-12
+	tst	n, n
+	beq	L(wd1)
+L(gt1):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	b	L(mid)
+
+L(b10):	ldrd	r4, r5, [up]
+	ldrd	r6, r7, [vp]
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	INITCY
+	sub	rp, rp, #8
+	b	L(lo)
+
+	ALIGN(16)
+L(top):	ldrd	r6, r7, [vp, #8]
+	ldrd	r4, r5, [up, #8]
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	strd	r8, r9, [rp, #8]
+L(mid):	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	ldrd	r6, r7, [vp, #16]!
+	ldrd	r4, r5, [up, #16]!
+	bic	r6, r6, cnd
+	bic	r7, r7, cnd
+	sub	n, n, #1
+	strd	r8, r9, [rp, #16]!
+L(lo):	ADDSUBC	r8, r4, r6
+	ADDSUBC	r9, r5, r7
+	tst	n, n
+	bne	L(top)
+
+L(end):	strd	r8, r9, [rp, #8]
+L(wd1):	RETVAL
+	pop	{ r4-r9 }
+	bx	r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/com.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/com.asm
new file mode 100644
index 0000000..a258afe
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/com.asm
@@ -0,0 +1,180 @@
+dnl  ARM mpn_com optimised for A15.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	2.5
+C Cortex-A15	1.0
+
+C This is great A15 core register code, but it is a bit large.
+C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`FEEDIN_VARIANT', 1)	C alternatives: 0 1 2
+define(`UNROLL', 4x2)		C alternatives: 4 4x2
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+	push	{ r4-r5,r8-r9 }
+
+ifelse(FEEDIN_VARIANT,0,`
+	ands	r12, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00a)
+	tst	r12, #1
+	beq	L(bx0)
+	ldr	r5, [up], #4
+	mvn	r9, r5
+	str	r9, [rp], #4
+	tst	r12, #2
+	beq	L(b00)
+L(bx0):	ldrd	r4, r5, [up, #0]
+	sub	rp, rp, #8
+	b	L(lo)
+L(b00):	tst	n, n
+	beq	L(wd1)
+L(b00a):ldrd	r4, r5, [up], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+')
+ifelse(FEEDIN_VARIANT,1,`
+	and	r12, n, #3
+	mov	n, n, lsr #2
+	tst	r12, #1
+	beq	L(bx0)
+	ldr	r5, [up], #4
+	mvn	r9, r5
+	str	r9, [rp], #4
+L(bx0):	tst	r12, #2
+	beq	L(b00)
+	ldrd	r4, r5, [up, #0]
+	sub	rp, rp, #8
+	b	L(lo)
+L(b00):	tst	n, n
+	beq	L(wd1)
+	ldrd	r4, r5, [up], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+')
+ifelse(FEEDIN_VARIANT,2,`
+	ands	r12, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00)
+	cmp	r12, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	ldr	r5, [up], #4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #0]
+	str	r9, [rp], #-4
+	b	L(lo)
+
+L(b00):	ldrd	r4, r5, [up], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+
+L(b01):	ldr	r5, [up], #-4
+	mvn	r9, r5
+	str	r9, [rp], #-12
+	tst	n, n
+	beq	L(wd1)
+L(gt1):	ldrd	r4, r5, [up, #8]
+	b	L(mid)
+
+L(b10):	ldrd	r4, r5, [up]
+	sub	rp, rp, #8
+	b	L(lo)
+')
+	ALIGN(16)
+ifelse(UNROLL,4,`
+L(top):	ldrd	r4, r5, [up, #8]
+	strd	r8, r9, [rp, #8]
+L(mid):	mvn	r8, r4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #16]!
+	strd	r8, r9, [rp, #16]!
+	sub	n, n, #1
+L(lo):	mvn	r8, r4
+	mvn	r9, r5
+	tst	n, n
+	bne	L(top)
+')
+ifelse(UNROLL,4x2,`
+L(top):	ldrd	r4, r5, [up, #8]
+	strd	r8, r9, [rp, #8]
+L(mid):	mvn	r8, r4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #16]
+	strd	r8, r9, [rp, #16]
+	mvn	r8, r4
+	mvn	r9, r5
+	sub	n, n, #2
+	tst	n, n
+	bmi	L(dne)
+	ldrd	r4, r5, [up, #24]
+	strd	r8, r9, [rp, #24]
+	mvn	r8, r4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #32]!
+	strd	r8, r9, [rp, #32]!
+L(lo):	mvn	r8, r4
+	mvn	r9, r5
+	tst	n, n
+	bne	L(top)
+')
+
+L(end):	strd	r8, r9, [rp, #8]
+L(wd1):	pop	{ r4-r5,r8-r9 }
+	bx	r14
+ifelse(UNROLL,4x2,`
+L(dne):	strd	r8, r9, [rp, #24]
+	pop	{ r4-r5,r8-r9 }
+	bx	r14
+')
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h
new file mode 100644
index 0000000..409cbbb
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h
@@ -0,0 +1,212 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 2000 MHz Cortex-A15 with Neon (in spite of file position) */
+/* FFT tuning limit = 50,736,668 */
+/* Generated by tuneup.c, 2019-10-22, gcc 5.4 */
+
+#define MOD_1_NORM_THRESHOLD                 0  /* always */
+#define MOD_1_UNNORM_THRESHOLD               0  /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD          3
+#define MOD_1U_TO_MOD_1_1_THRESHOLD          2
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD        10
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD     MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD      8
+#define USE_PREINV_DIVREM_1                  1  /* native */
+#define DIV_QR_1N_PI1_METHOD                 1  /* 49.14% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD          MP_SIZE_T_MAX  /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD        MP_SIZE_T_MAX  /* never */
+#define DIV_QR_2_PI2_THRESHOLD           MP_SIZE_T_MAX  /* never */
+#define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD           17
+
+#define DIV_1_VS_MUL_1_PERCENT             267
+
+#define MUL_TOOM22_THRESHOLD                28
+#define MUL_TOOM33_THRESHOLD               114
+#define MUL_TOOM44_THRESHOLD               178
+#define MUL_TOOM6H_THRESHOLD               238
+#define MUL_TOOM8H_THRESHOLD               597
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD     113
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD     115
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD     115
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD     115
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD     154
+
+#define SQR_BASECASE_THRESHOLD               0  /* always (native) */
+#define SQR_TOOM2_THRESHOLD                 38
+#define SQR_TOOM3_THRESHOLD                126
+#define SQR_TOOM4_THRESHOLD                336
+#define SQR_TOOM6_THRESHOLD                446
+#define SQR_TOOM8_THRESHOLD                650
+
+#define MULMID_TOOM42_THRESHOLD             52
+
+#define MULMOD_BNM1_THRESHOLD               23
+#define SQRMOD_BNM1_THRESHOLD               17
+
+#define MUL_FFT_MODF_THRESHOLD             575  /* k = 5 */
+#define MUL_FFT_TABLE3                                      \
+  { {    575, 5}, {     25, 6}, {     13, 5}, {     27, 6}, \
+    {     27, 7}, {     15, 6}, {     31, 7}, {     19, 6}, \
+    {     39, 7}, {     25, 6}, {     51, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     51, 8}, {     27, 9}, {     15, 8}, \
+    {     31, 7}, {     63, 8}, {     39, 9}, {     23, 8}, \
+    {     55,10}, {     15, 9}, {     31, 8}, {     67, 9}, \
+    {     39, 8}, {     79, 9}, {     47, 8}, {     95, 9}, \
+    {     55,10}, {     31, 9}, {     71, 8}, {    143, 9}, \
+    {     87,10}, {     47, 9}, {    111,11}, {     31,10}, \
+    {     63, 9}, {    143,10}, {     79, 9}, {    159,10}, \
+    {     95,11}, {     63,10}, {    143, 9}, {    287,10}, \
+    {    159,11}, {     95,10}, {    191,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    335, 9}, \
+    {    671,10}, {    367, 9}, {    735,11}, {    191,10}, \
+    {    383, 9}, {    799,10}, {    415,11}, {    223,12}, \
+    {    127,10}, {    543,11}, {    287,10}, {    575,11}, \
+    {    319,10}, {    639,11}, {    351,10}, {    703,12}, \
+    {    191,11}, {    383,10}, {    767,11}, {    415,10}, \
+    {    831,11}, {    447,13}, {    127,12}, {    255,11}, \
+    {    543,10}, {   1087,11}, {    575,12}, {    319,11}, \
+    {    671,10}, {   1343,11}, {    735,12}, {    383,11}, \
+    {    831,12}, {    447,11}, {    959,13}, {    255,12}, \
+    {    511,11}, {   1087,12}, {    575,11}, {   1151,12}, \
+    {    639,11}, {   1343,12}, {    703,13}, {    383,12}, \
+    {    767,11}, {   1599,12}, {    895,14}, {    255,13}, \
+    {    511,12}, {   1087,13}, {    639,12}, {   1407,13}, \
+    {    767,12}, {   1599,13}, {    895,14}, {    511,13}, \
+    {   1023,12}, {   2111,13}, {   1151,12}, {   2431,13}, \
+    {   1279,12}, {   2559,13}, {   1407,14}, {    767,13}, \
+    {   1535,12}, {   3135,13}, {   1663,15}, {    511,14}, \
+    {   1023,13}, {   2303,14}, {   1279,13}, {   2559,12}, \
+    {   5119,13}, {   2687,14}, {   1535,13}, {   3071,12}, \
+    {   6143,13}, {   3199,12}, {   6399,14}, {   1791,15}, \
+    {   1023,14}, {   2047,13}, {   4095,14}, {   2303,13}, \
+    {   4607,12}, {   9215,13}, {   4863,12}, {   9727,14}, \
+    {   2559,13}, {   5119,15}, {   1535,14}, {   3071,13}, \
+    {   8192,14}, {  16384,15}, {  32768,16} }
+#define MUL_FFT_TABLE3_SIZE 155
+#define MUL_FFT_THRESHOLD                 5760
+
+#define SQR_FFT_MODF_THRESHOLD             525  /* k = 5 */
+#define SQR_FFT_TABLE3                                      \
+  { {    525, 5}, {     25, 6}, {     27, 7}, {     15, 6}, \
+    {     31, 7}, {     17, 6}, {     35, 7}, {     19, 6}, \
+    {     39, 7}, {     25, 6}, {     51, 7}, {     27, 8}, \
+    {     15, 7}, {     33, 8}, {     19, 7}, {     41, 8}, \
+    {     23, 7}, {     51, 8}, {     27, 7}, {     55, 9}, \
+    {     15, 8}, {     31, 7}, {     63, 8}, {     39, 9}, \
+    {     23, 8}, {     51,10}, {     15, 9}, {     31, 8}, \
+    {     67, 9}, {     39, 8}, {     79, 9}, {     47, 8}, \
+    {     99, 9}, {     55,10}, {     31, 9}, {     79,10}, \
+    {     47, 9}, {     95,11}, {     31,10}, {     63, 9}, \
+    {    135,10}, {     95,11}, {     63,10}, {    143, 9}, \
+    {    287, 8}, {    575, 9}, {    303,10}, {    159,11}, \
+    {     95,10}, {    191, 9}, {    383,12}, {     63,11}, \
+    {    127,10}, {    255, 9}, {    511,10}, {    271, 9}, \
+    {    543,10}, {    287,11}, {    159,10}, {    319, 9}, \
+    {    639,10}, {    335, 9}, {    671,10}, {    351,11}, \
+    {    191,10}, {    399, 9}, {    799,10}, {    415, 9}, \
+    {    831,11}, {    223,10}, {    447,12}, {    127,10}, \
+    {    543,11}, {    287,10}, {    575,11}, {    319,10}, \
+    {    639,11}, {    351,10}, {    703,12}, {    191,11}, \
+    {    383,10}, {    799,11}, {    415,10}, {    831,11}, \
+    {    447,13}, {    127,11}, {    543,10}, {   1087,11}, \
+    {    607,12}, {    319,11}, {    735,12}, {    383,11}, \
+    {    831,12}, {    447,11}, {    959,12}, {    511,11}, \
+    {   1023,12}, {    575,11}, {   1151,12}, {    639,11}, \
+    {   1279,12}, {    703,13}, {    383,12}, {    767,11}, \
+    {   1535,12}, {    831,11}, {   1663,12}, {    895,14}, \
+    {    255,13}, {    511,12}, {   1087,13}, {    639,12}, \
+    {   1343,13}, {    767,12}, {   1599,13}, {    895,14}, \
+    {    511,13}, {   1023,12}, {   2111,13}, {   1151,12}, \
+    {   2303,13}, {   1279,14}, {    767,13}, {   1535,12}, \
+    {   3135,13}, {   1663,15}, {    511,14}, {   1023,13}, \
+    {   2047,12}, {   4095,13}, {   2303,14}, {   1279,13}, \
+    {   2559,12}, {   5119,14}, {   1535,13}, {   3071,12}, \
+    {   6143,13}, {   3199,12}, {   6399,14}, {   1791,15}, \
+    {   1023,14}, {   2047,13}, {   4095,14}, {   2303,13}, \
+    {   4607,12}, {   9215,13}, {   4863,12}, {   9727,14}, \
+    {   2559,15}, {   1535,14}, {   3071,13}, {   8192,14}, \
+    {  16384,15}, {  32768,16} }
+#define SQR_FFT_TABLE3_SIZE 154
+#define SQR_FFT_THRESHOLD                 5312
+
+#define MULLO_BASECASE_THRESHOLD             0  /* always */
+#define MULLO_DC_THRESHOLD                  38
+#define MULLO_MUL_N_THRESHOLD            10950
+#define SQRLO_BASECASE_THRESHOLD            10
+#define SQRLO_DC_THRESHOLD                  35
+#define SQRLO_SQR_THRESHOLD              10323
+
+#define DC_DIV_QR_THRESHOLD                 57
+#define DC_DIVAPPR_Q_THRESHOLD             254
+#define DC_BDIV_QR_THRESHOLD                48
+#define DC_BDIV_Q_THRESHOLD                286
+
+#define INV_MULMOD_BNM1_THRESHOLD           55
+#define INV_NEWTON_THRESHOLD               252
+#define INV_APPR_THRESHOLD                 252
+
+#define BINV_NEWTON_THRESHOLD              372
+#define REDC_1_TO_REDC_2_THRESHOLD          61
+#define REDC_2_TO_REDC_N_THRESHOLD           0  /* always */
+
+#define MU_DIV_QR_THRESHOLD               1858
+#define MU_DIVAPPR_Q_THRESHOLD            1787
+#define MUPI_DIV_QR_THRESHOLD              122
+#define MU_BDIV_QR_THRESHOLD              1528
+#define MU_BDIV_Q_THRESHOLD               1836
+
+#define POWM_SEC_TABLE  1,14,200,480,1532
+
+#define GET_STR_DC_THRESHOLD                16
+#define GET_STR_PRECOMPUTE_THRESHOLD        33
+#define SET_STR_DC_THRESHOLD               104
+#define SET_STR_PRECOMPUTE_THRESHOLD      1120
+
+#define FAC_DSC_THRESHOLD                  164
+#define FAC_ODD_THRESHOLD                   27
+
+#define MATRIX22_STRASSEN_THRESHOLD         19
+#define HGCD2_DIV1_METHOD                    1  /* 3.70% faster than 3 */
+#define HGCD_THRESHOLD                     137
+#define HGCD_APPR_THRESHOLD                157
+#define HGCD_REDUCE_THRESHOLD             3389
+#define GCD_DC_THRESHOLD                   610
+#define GCDEXT_DC_THRESHOLD                443
+#define JACOBI_BASE_METHOD                   4  /* 12.66% faster than 1 */
+
+/* Tuneup completed successfully, took 69757 seconds */
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm
new file mode 100644
index 0000000..0602614
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm
@@ -0,0 +1,253 @@
+dnl  ARM mpn_and_n, mpn_andn_n. mpn_nand_n, etc, optimised for A15.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb             cycles/limb
+C          and andn ior xor         nand iorn nior xnor
+C StrongARM	 ?			 ?
+C XScale	 ?			 ?
+C Cortex-A7	 ?			 ?
+C Cortex-A8	 ?			 ?
+C Cortex-A9	3.5			3.56
+C Cortex-A15	1.27			1.64
+
+C This is great A15 core register code, but it is a bit large.
+C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`FEEDIN_VARIANT', 1)	C alternatives: 0 1 2
+define(`UNROLL', 4x2)		C alternatives: 4 4x2
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+define(`POSTOP')
+
+ifdef(`OPERATION_and_n',`
+  define(`func',    `mpn_and_n')
+  define(`LOGOP',   `and	$1, $2, $3')')
+ifdef(`OPERATION_andn_n',`
+  define(`func',    `mpn_andn_n')
+  define(`LOGOP',   `bic	$1, $2, $3')')
+ifdef(`OPERATION_nand_n',`
+  define(`func',    `mpn_nand_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `and	$1, $2, $3')')
+ifdef(`OPERATION_ior_n',`
+  define(`func',    `mpn_ior_n')
+  define(`LOGOP',   `orr	$1, $2, $3')')
+ifdef(`OPERATION_iorn_n',`
+  define(`func',    `mpn_iorn_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `bic	$1, $3, $2')')
+ifdef(`OPERATION_nior_n',`
+  define(`func',    `mpn_nior_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `orr	$1, $2, $3')')
+ifdef(`OPERATION_xor_n',`
+  define(`func',    `mpn_xor_n')
+  define(`LOGOP',   `eor	$1, $2, $3')')
+ifdef(`OPERATION_xnor_n',`
+  define(`func',    `mpn_xnor_n')
+  define(`POSTOP',  `mvn	$1, $1')
+  define(`LOGOP',   `eor	$1, $2, $3')')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	{ r4-r9 }
+
+ifelse(FEEDIN_VARIANT,0,`
+	ands	r6, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00a)
+	tst	r6, #1
+	beq	L(bx0)
+	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	LOGOP(	r9, r5, r7)
+	POSTOP(	r9)
+	str	r9, [rp], #4
+	tst	r6, #2
+	beq	L(b00)
+L(bx0):	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	sub	rp, rp, #8
+	b	L(lo)
+L(b00):	tst	n, n
+	beq	L(wd1)
+L(b00a):ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+')
+ifelse(FEEDIN_VARIANT,1,`
+	and	r6, n, #3
+	mov	n, n, lsr #2
+	tst	r6, #1
+	beq	L(bx0)
+	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	LOGOP(	r9, r5, r7)
+	POSTOP(	r9)
+	str	r9, [rp], #4
+L(bx0):	tst	r6, #2
+	beq	L(b00)
+	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	sub	rp, rp, #8
+	b	L(lo)
+L(b00):	tst	n, n
+	beq	L(wd1)
+	ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+')
+ifelse(FEEDIN_VARIANT,2,`
+	ands	r6, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	ldr	r5, [up], #4
+	ldr	r7, [vp], #4
+	LOGOP(	r9, r5, r7)
+	ldrd	r4, r5, [up, #0]
+	ldrd	r6, r7, [vp, #0]
+	POSTOP(	r9)
+	str	r9, [rp], #-4
+	b	L(lo)
+
+L(b00):	ldrd	r4, r5, [up], #-8
+	ldrd	r6, r7, [vp], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+
+L(b01):	ldr	r5, [up], #-4
+	ldr	r7, [vp], #-4
+	LOGOP(	r9, r5, r7)
+	POSTOP(	r9)
+	str	r9, [rp], #-12
+	tst	n, n
+	beq	L(wd1)
+L(gt1):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	b	L(mid)
+
+L(b10):	ldrd	r4, r5, [up]
+	ldrd	r6, r7, [vp]
+	sub	rp, rp, #8
+	b	L(lo)
+')
+	ALIGN(16)
+ifelse(UNROLL,4,`
+L(top):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #8]
+L(mid):	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	ldrd	r4, r5, [up, #16]!
+	ldrd	r6, r7, [vp, #16]!
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #16]!
+	sub	n, n, #1
+L(lo):	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	tst	n, n
+	bne	L(top)
+')
+ifelse(UNROLL,4x2,`
+L(top):	ldrd	r4, r5, [up, #8]
+	ldrd	r6, r7, [vp, #8]
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #8]
+L(mid):	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	ldrd	r4, r5, [up, #16]
+	ldrd	r6, r7, [vp, #16]
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #16]
+	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	sub	n, n, #2
+	tst	n, n
+	bmi	L(dne)
+	ldrd	r4, r5, [up, #24]
+	ldrd	r6, r7, [vp, #24]
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #24]
+	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	ldrd	r4, r5, [up, #32]!
+	ldrd	r6, r7, [vp, #32]!
+	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #32]!
+L(lo):	LOGOP(	r8, r4, r6)
+	LOGOP(	r9, r5, r7)
+	tst	n, n
+	bne	L(top)
+')
+
+L(end):	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #8]
+L(wd1):	pop	{ r4-r9 }
+	bx	r14
+ifelse(UNROLL,4x2,`
+L(dne):	POSTOP(	r8)
+	POSTOP(	r9)
+	strd	r8, r9, [rp, #24]
+	pop	{ r4-r9 }
+	bx	r14
+')
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm
new file mode 100644
index 0000000..766ba5c
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm
@@ -0,0 +1,104 @@
+dnl  ARM mpn_mul_1 optimised for A15.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:	 -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 5.25			3.25
+C Cortex-A15	 2.25			this
+
+
+C This runs well on A15 but very poorly on A9.  By scheduling loads and adds
+C it is possible to get good A9 performance as well, but at the cost of using
+C many more (callee-saves) registers.
+
+C This is armv5 code, optimized for the armv7a cpu A15.  Its location in the
+C GMP file structure might be misleading.
+
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`v0', `r3')
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+	ldr	r12, [sp]
+	b	L(ent)
+EPILOGUE()
+PROLOGUE(mpn_mul_1)
+	mov	r12, #0
+L(ent):	push	{r4-r7}
+
+	ldr	r6, [up], #4
+	tst	n, #1
+	beq	L(bx0)
+
+L(bx1):	umull	r4, r7, r6, v0
+	adds	r4, r4, r12
+	tst	n, #2
+	beq	L(lo1)
+	b	L(lo3)
+
+L(bx0):	umull	r4, r5, r6, v0
+	adds	r4, r4, r12
+	tst	n, #2
+	beq	L(lo0)
+	b	L(lo2)
+
+L(top):	ldr	r6, [up], #4
+	str	r4, [rp], #4
+	umull	r4, r5, r6, v0
+	adds	r4, r4, r7
+L(lo0):	ldr	r6, [up], #4
+	str	r4, [rp], #4
+	umull	r4, r7, r6, v0
+	adcs	r4, r4, r5
+L(lo3):	ldr	r6, [up], #4
+	str	r4, [rp], #4
+	umull	r4, r5, r6, v0
+	adcs	r4, r4, r7
+L(lo2):	ldr	r6, [up], #4
+	str	r4, [rp], #4
+	umull	r4, r7, r6, v0
+	adcs	r4, r4, r5
+L(lo1):	adc	r7, r7, #0
+	subs	n, n, #4
+	bgt	L(top)
+
+	str	r4, [rp]
+	mov	r0, r7
+	pop	{r4-r7}
+	bx	lr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
new file mode 100644
index 0000000..d8cfe3f
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
@@ -0,0 +1,43 @@
+dnl  ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH,		1)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
new file mode 100644
index 0000000..b48204d
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
@@ -0,0 +1,43 @@
+dnl  ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH,		2)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
new file mode 100644
index 0000000..51f93c1
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
@@ -0,0 +1,144 @@
+dnl  ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 5.25
+C Cortex-A15	 2.25
+
+C TODO
+C  * Consider using 4-way feed-in code.
+C  * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
+C    insufficiently for A7 and A8.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`DO_add', `
+  define(`ADCSBCS',	`adcs	$1, $2, $3')
+  define(`CLRCY',	`cmn	r13, #1')
+  define(`RETVAL',	`adc	r0, $1, #0')
+  define(`func',	mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+  define(`ADCSBCS',	`sbcs	$1, $2, $3')
+  define(`CLRCY',	`cmp	r13, #0')
+  define(`RETVAL',	`sbc	$2, $2, $2
+			cmn	$2, #1
+			adc	r0, $1, #0')
+  define(`func',	mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+  define(`ADCSBCS',	`sbcs	$1, $3, $2')
+  define(`CLRCY',	`cmp	r13, #0')
+  define(`RETVAL',	`sbc	r0, $1, #0')
+  define(`func',	mpn_rsblsh`'LSH`'_n)')
+
+
+ASM_START()
+PROLOGUE(func)
+	push	 {r4-r10}
+	vmov.i8	 d0, #0			C could feed carry through here
+	CLRCY
+	tst	n, #1
+	beq	L(bb0)
+
+L(bb1):	vld1.32	 {d3[0]}, [vp]!
+	vsli.u32 d0, d3, #LSH
+	ldr	 r12, [up], #4
+	vmov.32	 r5, d0[0]
+	vshr.u32 d0, d3, #32-LSH
+	ADCSBCS( r12, r12, r5)
+	str	 r12, [rp], #4
+	bics	 n, n, #1
+	beq	 L(rtn)
+
+L(bb0):	tst	n, #2
+	beq	L(b00)
+
+L(b10):	vld1.32	 {d3}, [vp]!
+	vsli.u64 d0, d3, #LSH
+	ldmia	 up!, {r10,r12}
+	vmov	 r4, r5, d0
+	vshr.u64 d0, d3, #64-LSH
+	ADCSBCS( r10, r10, r4)
+	ADCSBCS( r12, r12, r5)
+	stmia	 rp!, {r10,r12}
+	bics	 n, n, #2
+	beq	 L(rtn)
+
+L(b00):	vld1.32	 {d2}, [vp]!
+	vsli.u64 d0, d2, #LSH
+	vshr.u64 d1, d2, #64-LSH
+	vld1.32	 {d3}, [vp]!
+	vsli.u64 d1, d3, #LSH
+	vmov	 r6, r7, d0
+	vshr.u64 d0, d3, #64-LSH
+	sub	 n, n, #4
+	tst	 n, n
+	beq	 L(end)
+
+	ALIGN(16)
+L(top):	ldmia	 up!, {r8,r9,r10,r12}
+	vld1.32	 {d2}, [vp]!
+	vsli.u64 d0, d2, #LSH
+	vmov	 r4, r5, d1
+	vshr.u64 d1, d2, #64-LSH
+	ADCSBCS( r8, r8, r6)
+	ADCSBCS( r9, r9, r7)
+	vld1.32	 {d3}, [vp]!
+	vsli.u64 d1, d3, #LSH
+	vmov	 r6, r7, d0
+	vshr.u64 d0, d3, #64-LSH
+	ADCSBCS( r10, r10, r4)
+	ADCSBCS( r12, r12, r5)
+	stmia	 rp!, {r8,r9,r10,r12}
+	sub	 n, n, #4
+	tst	 n, n
+	bne	 L(top)
+
+L(end):	ldmia	 up!, {r8,r9,r10,r12}
+	vmov	 r4, r5, d1
+	ADCSBCS( r8, r8, r6)
+	ADCSBCS( r9, r9, r7)
+	ADCSBCS( r10, r10, r4)
+	ADCSBCS( r12, r12, r5)
+	stmia	 rp!, {r8,r9,r10,r12}
+L(rtn):	vmov.32	 r0, d0[0]
+	RETVAL(	 r0, r1)
+	pop	 {r4-r10}
+	bx	 r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm
new file mode 100644
index 0000000..9e7a629
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm
@@ -0,0 +1,97 @@
+dnl  ARM Neon mpn_com optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 2.1
+C Cortex-A15	 0.65
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+	cmp		n, #7
+	ble		L(bc)
+
+C Perform a few initial operation until rp is 128-bit aligned
+	tst		rp, #4
+	beq		L(al1)
+	vld1.32		{d0[0]}, [up]!
+	sub		n, n, #1
+	vmvn		d0, d0
+	vst1.32		{d0[0]}, [rp]!
+L(al1):	tst		rp, #8
+	beq		L(al2)
+	vld1.32		{d0}, [up]!
+	sub		n, n, #2
+	vmvn		d0, d0
+	vst1.32		{d0}, [rp:64]!
+L(al2):	vld1.32		{q2}, [up]!
+	subs		n, n, #12
+	blt		L(end)
+
+	ALIGN(16)
+L(top):	vld1.32		{q0}, [up]!
+	vmvn		q2, q2
+	subs		n, n, #8
+	vst1.32		{q2}, [rp:128]!
+	vld1.32		{q2}, [up]!
+	vmvn		q0, q0
+	vst1.32		{q0}, [rp:128]!
+	bge	L(top)
+
+L(end):	vmvn		q2, q2
+	vst1.32		{q2}, [rp:128]!
+
+C Handle last 0-7 limbs.  Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+L(bc):	tst		n, #4
+	beq		L(tl1)
+	vld1.32		{q0}, [up]!
+	vmvn		q0, q0
+	vst1.32		{q0}, [rp]!
+L(tl1):	tst		n, #2
+	beq		L(tl2)
+	vld1.32		{d0}, [up]!
+	vmvn		d0, d0
+	vst1.32		{d0}, [rp]!
+L(tl2):	tst		n, #1
+	beq		L(tl3)
+	vld1.32		{d0[0]}, [up]
+	vmvn		d0, d0
+	vst1.32		{d0[0]}, [rp]
+L(tl3):	bx		lr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm
new file mode 100644
index 0000000..98fe535
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm
@@ -0,0 +1,110 @@
+dnl  ARM Neon mpn_copyd optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.75		slower than core register code
+C Cortex-A15	 0.52
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+	add	rp, rp, n, lsl #2
+	add	up, up, n, lsl #2
+
+	cmp	n, #7
+	ble	L(bc)
+
+C Copy until rp is 128-bit aligned
+	tst	rp, #4
+	beq	L(al1)
+	sub	up, up, #4
+	vld1.32	{d22[0]}, [up]
+	sub	n, n, #1
+	sub	rp, rp, #4
+	vst1.32	{d22[0]}, [rp]
+L(al1):	tst	rp, #8
+	beq	L(al2)
+	sub	up, up, #8
+	vld1.32	{d22}, [up]
+	sub	n, n, #2
+	sub	rp, rp, #8
+	vst1.32	{d22}, [rp:64]
+L(al2):	sub	up, up, #16
+	vld1.32	{d26-d27}, [up]
+	subs	n, n, #12
+	sub	rp, rp, #16			C offset rp for loop
+	blt	L(end)
+
+	sub	up, up, #16			C offset up for loop
+	mov	r12, #-16
+
+	ALIGN(16)
+L(top):	vld1.32	{d22-d23}, [up], r12
+	vst1.32	{d26-d27}, [rp:128], r12
+	vld1.32	{d26-d27}, [up], r12
+	vst1.32	{d22-d23}, [rp:128], r12
+	subs	n, n, #8
+	bge	L(top)
+
+	add	up, up, #16			C undo up offset
+						C rp offset undoing folded
+L(end):	vst1.32	{d26-d27}, [rp:128]
+
+C Copy last 0-7 limbs.  Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+L(bc):	tst	n, #4
+	beq	L(tl1)
+	sub	up, up, #16
+	vld1.32	{d22-d23}, [up]
+	sub	rp, rp, #16
+	vst1.32	{d22-d23}, [rp]
+L(tl1):	tst	n, #2
+	beq	L(tl2)
+	sub	up, up, #8
+	vld1.32	{d22}, [up]
+	sub	rp, rp, #8
+	vst1.32	{d22}, [rp]
+L(tl2):	tst	n, #1
+	beq	L(tl3)
+	sub	up, up, #4
+	vld1.32	{d22[0]}, [up]
+	sub	rp, rp, #4
+	vst1.32	{d22[0]}, [rp]
+L(tl3):	bx	lr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm
new file mode 100644
index 0000000..2e05afe
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm
@@ -0,0 +1,90 @@
+dnl  ARM Neon mpn_copyi optimised for A15.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.75		slower than core register code
+C Cortex-A15	 0.52
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+	cmp	n, #7
+	ble	L(bc)
+
+C Copy until rp is 128-bit aligned
+	tst	rp, #4
+	beq	L(al1)
+	vld1.32	{d22[0]}, [up]!
+	sub	n, n, #1
+	vst1.32	{d22[0]}, [rp]!
+L(al1):	tst	rp, #8
+	beq	L(al2)
+	vld1.32	{d22}, [up]!
+	sub	n, n, #2
+	vst1.32	{d22}, [rp:64]!
+L(al2):	vld1.32	{d26-d27}, [up]!
+	subs	n, n, #12
+	blt	L(end)
+
+	ALIGN(16)
+L(top):	vld1.32	{d22-d23}, [up]!
+	vst1.32	{d26-d27}, [rp:128]!
+	vld1.32	{d26-d27}, [up]!
+	vst1.32	{d22-d23}, [rp:128]!
+	subs	n, n, #8
+	bge	L(top)
+
+L(end):	vst1.32	{d26-d27}, [rp:128]!
+
+C Copy last 0-7 limbs.  Note that rp is aligned after loop, but not when we
+C arrive here via L(bc)
+L(bc):	tst	n, #4
+	beq	L(tl1)
+	vld1.32	{d22-d23}, [up]!
+	vst1.32	{d22-d23}, [rp]!
+L(tl1):	tst	n, #2
+	beq	L(tl2)
+	vld1.32	{d22}, [up]!
+	vst1.32	{d22}, [rp]!
+L(tl2):	tst	n, #1
+	beq	L(tl3)
+	vld1.32	{d22[0]}, [up]
+	vst1.32	{d22[0]}, [rp]
+L(tl3):	bx	lr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
new file mode 100644
index 0000000..2c11d6d
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
@@ -0,0 +1,177 @@
+dnl  ARM Neon mpn_rsh1add_n, mpn_rsh1sub_n.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	4-5
+C Cortex-A15	 2.5
+
+C TODO
+C  * Try to make this smaller, its size (384 bytes) is excessive.
+C  * Try to reach 2.25 c/l on A15, to match the addlsh_1 family.
+C  * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
+C    insufficiently for A7 and A8.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`OPERATION_rsh1add_n', `
+  define(`ADDSUBS',	`adds	$1, $2, $3')
+  define(`ADCSBCS',	`adcs	$1, $2, $3')
+  define(`IFADD',	`$1')
+  define(`IFSUB',	`')
+  define(`func',	mpn_rsh1add_n)')
+ifdef(`OPERATION_rsh1sub_n', `
+  define(`ADDSUBS',	`subs	$1, $2, $3')
+  define(`ADCSBCS',	`sbcs	$1, $2, $3')
+  define(`IFADD',	`')
+  define(`IFSUB',	`$1')
+  define(`func',	mpn_rsh1sub_n)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	 {r4-r10}
+
+	ands	r4, n, #3
+	beq	L(b00)
+	cmp	r4, #2
+	blo	L(b01)
+	beq	L(b10)
+
+L(b11):	ldmia	 up!, {r9,r10,r12}
+	ldmia	 vp!, {r5,r6,r7}
+	ADDSUBS( r9, r9, r5)
+	vmov	 d4, r9, r9
+	ADCSBCS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vshr.u64 d3, d4, #1
+	vmov	 d1, r10, r12
+	vsli.u64 d3, d1, #31
+	vshr.u64 d2, d1, #1
+	vst1.32	 d3[0], [rp]!
+	bics	 n, n, #3
+	beq	 L(wd2)
+L(gt3):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	b	 L(mi0)
+
+L(b10):	ldmia	 up!, {r10,r12}
+	ldmia	 vp!, {r6,r7}
+	ADDSUBS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vmov	 d4, r10, r12
+	bics	 n, n, #2
+	vshr.u64 d2, d4, #1
+	beq	 L(wd2)
+L(gt2):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	b	 L(mi0)
+
+L(b01):	ldr	 r12, [up], #4
+	ldr	 r7, [vp], #4
+	ADDSUBS( r12, r12, r7)
+	vmov	 d4, r12, r12
+	bics	 n, n, #1
+	bne	 L(gt1)
+	mov	 r5, r12, lsr #1
+IFADD(`	adc	 r1, n, #0')
+IFSUB(`	adc	 r1, n, #1')
+	bfi	 r5, r1, #31, #1
+	str	 r5, [rp]
+	and	 r0, r12, #1
+	pop	 {r4-r10}
+	bx	 r14
+L(gt1):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	vshr.u64 d2, d4, #1
+	ADCSBCS( r8, r8, r4)
+	ADCSBCS( r9, r9, r5)
+	vmov	 d0, r8, r9
+	ADCSBCS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vsli.u64 d2, d0, #31
+	vshr.u64 d3, d0, #1
+	vst1.32	 d2[0], [rp]!
+	b	 L(mi1)
+
+L(b00):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	ADDSUBS( r8, r8, r4)
+	ADCSBCS( r9, r9, r5)
+	vmov	 d4, r8, r9
+	ADCSBCS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vshr.u64 d3, d4, #1
+	b	 L(mi1)
+
+	ALIGN(16)
+L(top):	ldmia	 up!, {r8,r9,r10,r12}
+	ldmia	 vp!, {r4,r5,r6,r7}
+	vsli.u64 d3, d1, #63
+	vshr.u64 d2, d1, #1
+	vst1.32	 d3, [rp]!
+L(mi0):	ADCSBCS( r8, r8, r4)
+	ADCSBCS( r9, r9, r5)
+	vmov	 d0, r8, r9
+	ADCSBCS( r10, r10, r6)
+	ADCSBCS( r12, r12, r7)
+	vsli.u64 d2, d0, #63
+	vshr.u64 d3, d0, #1
+	vst1.32	 d2, [rp]!
+L(mi1):	vmov	 d1, r10, r12
+	sub	 n, n, #4
+	tst	 n, n
+	bne	 L(top)
+
+L(end):	vsli.u64 d3, d1, #63
+	vshr.u64 d2, d1, #1
+	vst1.32	 d3, [rp]!
+L(wd2):	vmov	 r4, r5, d2
+IFADD(`	adc	 r1, n, #0')
+IFSUB(`	adc	 r1, n, #1')
+	bfi	 r5, r1, #31, #1
+	stm	 rp, {r4,r5}
+
+L(rtn):	vmov.32	 r0, d4[0]
+	and	 r0, r0, #1
+	pop	 {r4-r10}
+	bx	 r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm
new file mode 100644
index 0000000..ed7bfe8
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm
@@ -0,0 +1,159 @@
+dnl  ARM mpn_submul_1 optimised for A15.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:     -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 5.75			3.75
+C Cortex-A15	 2.32			this
+
+C This code uses umlal and umaal for adding in the rp[] data, keeping the
+C recurrency path separate from any multiply instructions.  It performs well on
+C A15, but not quite at the multiply bandwidth like the corresponding addmul_1
+C code.
+C
+C We don't use r12 due to ldrd and strd limitations.
+C
+C This loop complements U on the fly,
+C   U' = B^n - 1 - U
+C and then uses that
+C   R - U*v = R + U'*v + v - B^n v
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	umaal
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+define(`v0', `r3')
+
+define(`w0', `r10') define(`w1', `r11')
+define(`u0', `r8')  define(`u1', `r9')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+	sub	sp, sp, #32
+	strd	r10, r11, [sp, #24]
+	strd	r8, r9, [sp, #16]
+	strd	r6, r7, [sp, #8]
+	strd	r4, r5, [sp, #0]
+C	push	{ r4-r11 }
+
+	ands	r6, n, #3
+	sub	n, n, #3
+	beq	L(b00)
+	cmp	r6, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	mov	r6, #0
+	ldr	u1, [up], #-4
+	ldr	w1, [rp], #-16
+	mvn	u1, u1
+	adds	r7, v0, #0
+	b	L(mid)
+
+L(b00):	ldrd	u0, u1, [up]
+	ldrd	w0, w1, [rp], #-12
+	mvn	u0, u0
+	mvn	u1, u1
+	mov	r6, v0
+	umaal	w0, r6, u0, v0
+	cmn	r13, #0			C carry clear
+	mov	r7, #0
+	str	w0, [rp, #12]
+	b	L(mid)
+
+L(b10):	ldrd	u0, u1, [up], #8
+	ldrd	w0, w1, [rp]
+	mvn	u0, u0
+	mvn	u1, u1
+	mov	r4, v0
+	umaal	w0, r4, u0, v0
+	mov	r5, #0
+	str	w0, [rp], #-4
+	umlal	w1, r5, u1, v0
+	adds	n, n, #0
+	bmi	L(end)
+	b	L(top)
+
+L(b01):	ldr	u1, [up], #4
+	ldr	w1, [rp], #-8
+	mvn	u1, u1
+	mov	r5, v0
+	mov	r4, #0
+	umaal	w1, r5, u1, v0
+	tst	n, n
+	bmi	L(end)
+
+C	ALIGN(16)
+L(top):	ldrd	u0, u1, [up, #0]
+	adcs	r4, r4, w1
+	mvn	u0, u0
+	ldrd	w0, w1, [rp, #12]
+	mvn	u1, u1
+	mov	r6, #0
+	umlal	w0, r6, u0, v0		C 1 2
+	adcs	r5, r5, w0
+	mov	r7, #0
+	strd	r4, r5, [rp, #8]
+L(mid):	umaal	w1, r7, u1, v0		C 2 3
+	ldrd	u0, u1, [up, #8]
+	add	up, up, #16
+	adcs	r6, r6, w1
+	mvn	u0, u0
+	ldrd	w0, w1, [rp, #20]
+	mvn	u1, u1
+	mov	r4, #0
+	umlal	w0, r4, u0, v0		C 3 4
+	adcs	r7, r7, w0
+	mov	r5, #0
+	strd	r6, r7, [rp, #16]!
+	sub	n, n, #4
+	umlal	w1, r5, u1, v0		C 0 1
+	tst	n, n
+	bpl	L(top)
+
+L(end):	adcs	r4, r4, w1
+	str	r4, [rp, #8]
+	adc	r0, r5, #0
+	sub	r0, v0, r0
+	pop	{ r4-r11 }
+	bx	r14
+EPILOGUE()
-- 
cgit v1.2.3