Diffstat (limited to 'gmp-6.3.0/mpn/arm/v6')
-rw-r--r--  gmp-6.3.0/mpn/arm/v6/addmul_1.asm      112
-rw-r--r--  gmp-6.3.0/mpn/arm/v6/addmul_2.asm      125
-rw-r--r--  gmp-6.3.0/mpn/arm/v6/addmul_3.asm      191
-rw-r--r--  gmp-6.3.0/mpn/arm/v6/dive_1.asm        149
-rw-r--r--  gmp-6.3.0/mpn/arm/v6/gmp-mparam.h      187
-rw-r--r--  gmp-6.3.0/mpn/arm/v6/mode1o.asm         95
-rw-r--r--  gmp-6.3.0/mpn/arm/v6/mul_1.asm         115
-rw-r--r--  gmp-6.3.0/mpn/arm/v6/mul_2.asm         135
-rw-r--r--  gmp-6.3.0/mpn/arm/v6/popham.asm        139
-rw-r--r--  gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm  544
-rw-r--r--  gmp-6.3.0/mpn/arm/v6/submul_1.asm      125
11 files changed, 1917 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/arm/v6/addmul_1.asm b/gmp-6.3.0/mpn/arm/v6/addmul_1.asm
new file mode 100644
index 0000000..a38af58
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/addmul_1.asm
@@ -0,0 +1,112 @@
+dnl ARM mpn_addmul_1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C ARM11 6.4
+C Cortex-A7 5.25
+C Cortex-A8 7
+C Cortex-A9 3.25
+C Cortex-A15 4
+
+C TODO
+C * Micro-optimise feed-in code.
+C * Optimise for n=1,2 by delaying register saving.
+C * Try using ldm/stm.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`v0',`r3')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+ stmfd sp!, { r4, r5, r6, r7 }
+
+ ands r6, n, #3
+ mov r12, #0
+ beq L(fi0)
+ cmp r6, #2
+ bcc L(fi1)
+ beq L(fi2)
+
+L(fi3): ldr r4, [up], #4
+ ldr r6, [rp, #0]
+ ldr r5, [up], #4
+ b L(lo3)
+
+L(fi0): ldr r5, [up], #4
+ ldr r7, [rp], #4
+ ldr r4, [up], #4
+ b L(lo0)
+
+L(fi1): ldr r4, [up], #4
+ ldr r6, [rp], #8
+ subs n, n, #1
+ beq L(1)
+ ldr r5, [up], #4
+ b L(lo1)
+
+L(fi2): ldr r5, [up], #4
+ ldr r7, [rp], #12
+ ldr r4, [up], #4
+ b L(lo2)
+
+ ALIGN(16)
+L(top): ldr r6, [rp, #-8]
+ ldr r5, [up], #4
+ str r7, [rp, #-12]
+L(lo1): umaal r6, r12, r4, v0
+ ldr r7, [rp, #-4]
+ ldr r4, [up], #4
+ str r6, [rp, #-8]
+L(lo0): umaal r7, r12, r5, v0
+ ldr r6, [rp, #0]
+ ldr r5, [up], #4
+ str r7, [rp, #-4]
+L(lo3): umaal r6, r12, r4, v0
+ ldr r7, [rp, #4]
+ ldr r4, [up], #4
+ str r6, [rp], #16
+L(lo2): umaal r7, r12, r5, v0
+ subs n, n, #4
+ bhi L(top)
+
+ ldr r6, [rp, #-8]
+ str r7, [rp, #-12]
+L(1): umaal r6, r12, r4, v0
+ str r6, [rp, #-8]
+ mov r0, r12
+ ldmfd sp!, { r4, r5, r6, r7 }
+ bx lr
+EPILOGUE()
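
Note on the routine above: mpn_addmul_1 is the documented GMP primitive that multiplies {up,n} by the single limb v0, adds the product into {rp,n}, and returns the carry limb. The four-way unrolled loop does one umaal (unsigned multiply-accumulate-accumulate long) per limb, which is exactly the 32x32 + 32 + 32 -> 64 bit step. A minimal C sketch of the same operation, assuming the 32-bit limbs this ARMv6 code targets (ref_addmul_1 is a hypothetical name, not part of the patch):

    #include <gmp.h>

    mp_limb_t
    ref_addmul_1 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, mp_limb_t v0)
    {
      mp_limb_t cy = 0;
      for (mp_size_t i = 0; i < n; i++)
        {
          /* one umaal: product + old rp limb + incoming carry, 64 bits wide */
          unsigned long long t = (unsigned long long) up[i] * v0 + rp[i] + cy;
          rp[i] = (mp_limb_t) t;        /* low half back to rp */
          cy = (mp_limb_t) (t >> 32);   /* high half becomes the next carry */
        }
      return cy;
    }
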
diff --git a/gmp-6.3.0/mpn/arm/v6/addmul_2.asm b/gmp-6.3.0/mpn/arm/v6/addmul_2.asm
new file mode 100644
index 0000000..69d0b8f
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/addmul_2.asm
@@ -0,0 +1,125 @@
+dnl ARM mpn_addmul_2.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C ARM11 4.68
+C Cortex-A5 3.63
+C Cortex-A7 3.65
+C Cortex-A8 4.0
+C Cortex-A9 2.25
+C Cortex-A15 2.5
+C Cortex-A17 2.13
+C Cortex-A53 3.5
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`vp',`r3')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`u0',`r3')
+define(`u1',`r9')
+
+define(`cya',`r8')
+define(`cyb',`r12')
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+ push { r4-r9 }
+
+ ldrd v0, v1, [vp, #0]
+ mov cya, #0
+ mov cyb, #0
+
+ tst n, #1
+ beq L(evn)
+
+L(odd): ldr u1, [up, #0]
+ ldr r4, [rp, #0]
+ tst n, #2
+ beq L(fi1)
+L(fi3): sub up, up, #8
+ sub rp, rp, #8
+ b L(lo3)
+L(fi1): sub n, n, #1
+ b L(top)
+
+L(evn): ldr u0, [up, #0]
+ ldr r5, [rp, #0]
+ tst n, #2
+ bne L(fi2)
+L(fi0): sub up, up, #4
+ sub rp, rp, #4
+ b L(lo0)
+L(fi2): sub up, up, #12
+ sub rp, rp, #12
+ b L(lo2)
+
+ ALIGN(16)
+L(top): ldr r5, [rp, #4]
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #0]
+L(lo0): ldr r4, [rp, #8]
+ umaal r5, cya, u0, v0
+ ldr u1, [up, #8]
+ umaal r4, cyb, u0, v1
+ str r5, [rp, #4]
+L(lo3): ldr r5, [rp, #12]
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #12]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #8]
+L(lo2): ldr r4, [rp, #16]!
+ umaal r5, cya, u0, v0
+ ldr u1, [up, #16]!
+ umaal r4, cyb, u0, v1
+ str r5, [rp, #-4]
+ subs n, n, #4
+ bhi L(top)
+
+L(end): umaal r4, cya, u1, v0
+ umaal cya, cyb, u1, v1
+ str r4, [rp, #0]
+ str cya, [rp, #4]
+ mov r0, cyb
+
+ pop { r4-r9 }
+ bx r14
+EPILOGUE()
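
mpn_addmul_2 is an internal GMP building block; from the loop above it adds the product {up,n} x {vp,2} to {rp,n}, writes the low n+1 limbs back to rp[0..n] (rp[n] is pure output), and returns the most significant limb, with cya/cyb carrying the two interleaved accumulation chains. A compact model in terms of the public mpn_addmul_1 function, based on my reading of the code rather than an official definition (ref_addmul_2 is a hypothetical name):

    #include <gmp.h>

    mp_limb_t
    ref_addmul_2 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
                  const mp_limb_t *vp)
    {
      /* row for vp[0]: the carry out of rp[n-1] lands in rp[n] */
      rp[n] = mpn_addmul_1 (rp, up, n, vp[0]);
      /* row for vp[1], shifted one limb up; its carry is the return value */
      return mpn_addmul_1 (rp + 1, up, n, vp[1]);
    }
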
diff --git a/gmp-6.3.0/mpn/arm/v6/addmul_3.asm b/gmp-6.3.0/mpn/arm/v6/addmul_3.asm
new file mode 100644
index 0000000..d1490cd
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/addmul_3.asm
@@ -0,0 +1,191 @@
+dnl ARM mpn_addmul_3.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C ARM11 4.33
+C Cortex-A5 3.28
+C Cortex-A7 3.25
+C Cortex-A8 3.17
+C Cortex-A9 2.125
+C Cortex-A15 2
+C Cortex-A17 2.11
+C Cortex-A53 4.18
+
+C TODO
+C * Use a fast path for n <= KARATSUBA_MUL_THRESHOLD using a jump table,
+C avoiding the current multiply.
+C * Start the first multiply or multiplies early.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`vp',`r3')
+
+define(`v0',`r4') define(`v1',`r5') define(`v2',`r6')
+define(`u0',`r3') define(`u1',`r14')
+define(`w0',`r7') define(`w1',`r8') define(`w2',`r9')
+define(`cy0',`r10') define(`cy1',`r11') define(`cy2',`r12')
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_3)
+ push { r4-r11, r14 }
+
+ ldr w0, =0xaaaaaaab C 3^{-1} mod 2^32
+ ldm vp, { v0,v1,v2 }
+ mov cy0, #0
+ mov cy1, #0
+ mov cy2, #0
+
+C Tricky n mod 6
+ mul w0, w0, n C n * 3^{-1} mod 2^32
+ and w0, w0, #0xc0000001 C pseudo-CRT mod 3,2
+ sub n, n, #3
+ifdef(`PIC',`
+ add pc, pc, w0, ror $28
+ nop
+ b L(b0)
+ b L(b2)
+ b L(b4)
+ .word 0xe7f000f0 C udf
+ b L(b3)
+ b L(b5)
+ b L(b1)
+',`
+ ldr pc, [pc, w0, ror $28]
+ nop
+ .word L(b0), L(b2), L(b4), 0, L(b3), L(b5), L(b1)
+')
+
+L(b5): add up, up, #-8
+ ldr w1, [rp, #0]
+ ldr w2, [rp, #4]
+ ldr u1, [up, #8]
+ b L(lo5)
+
+L(b4): add rp, rp, #-4
+ add up, up, #-12
+ ldr w2, [rp, #4]
+ ldr w0, [rp, #8]
+ ldr u0, [up, #12]
+ b L(lo4)
+
+L(b3): add rp, rp, #-8
+ add up, up, #-16
+ ldr w0, [rp, #8]
+ ldr w1, [rp, #12]
+ ldr u1, [up, #16]
+ b L(lo3)
+
+L(b1): add rp, rp, #8
+ ldr w2, [rp, #-8]
+ ldr w0, [rp, #-4]
+ ldr u1, [up, #0]
+ b L(lo1)
+
+L(b0): add rp, rp, #4
+ add up, up, #-4
+ ldr w0, [rp, #-4]
+ ldr w1, [rp, #0]
+ ldr u0, [up, #4]
+ b L(lo0)
+
+L(b2): add rp, rp, #12
+ add up, up, #4
+ ldr w1, [rp, #-12]
+ ldr w2, [rp, #-8]
+ ldr u0, [up, #-4]
+
+ ALIGN(16)
+L(top): ldr w0, [rp, #-4]
+ umaal w1, cy0, u0, v0
+ ldr u1, [up, #0]
+ umaal w2, cy1, u0, v1
+ str w1, [rp, #-12]
+ umaal w0, cy2, u0, v2
+L(lo1): ldr w1, [rp, #0]
+ umaal w2, cy0, u1, v0
+ ldr u0, [up, #4]
+ umaal w0, cy1, u1, v1
+ str w2, [rp, #-8]
+ umaal w1, cy2, u1, v2
+L(lo0): ldr w2, [rp, #4]
+ umaal w0, cy0, u0, v0
+ ldr u1, [up, #8]
+ umaal w1, cy1, u0, v1
+ str w0, [rp, #-4]
+ umaal w2, cy2, u0, v2
+L(lo5): ldr w0, [rp, #8]
+ umaal w1, cy0, u1, v0
+ ldr u0, [up, #12]
+ umaal w2, cy1, u1, v1
+ str w1, [rp, #0]
+ umaal w0, cy2, u1, v2
+L(lo4): ldr w1, [rp, #12]
+ umaal w2, cy0, u0, v0
+ ldr u1, [up, #16]
+ umaal w0, cy1, u0, v1
+ str w2, [rp, #4]
+ umaal w1, cy2, u0, v2
+L(lo3): ldr w2, [rp, #16]
+ umaal w0, cy0, u1, v0
+ ldr u0, [up, #20]
+ umaal w1, cy1, u1, v1
+ str w0, [rp, #8]
+ umaal w2, cy2, u1, v2
+L(lo2): subs n, n, #6
+ add up, up, #24
+ add rp, rp, #24
+ bge L(top)
+
+L(end): umaal w1, cy0, u0, v0
+ ldr u1, [up, #0]
+ umaal w2, cy1, u0, v1
+ str w1, [rp, #-12]
+ mov w0, #0
+ umaal w0, cy2, u0, v2
+ umaal w2, cy0, u1, v0
+ umaal w0, cy1, u1, v1
+ str w2, [rp, #-8]
+ umaal cy1, cy2, u1, v2
+ adds w0, w0, cy0
+ str w0, [rp, #-4]
+ adcs w1, cy1, #0
+ str w1, [rp, #0]
+ adc r0, cy2, #0
+
+ pop { r4-r11, pc }
+EPILOGUE()
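
The "tricky n mod 6" sequence above deserves a gloss: multiplying n by 3^{-1} mod 2^32 (0xaaaaaaab) puts the residue of n mod 3 into the two top bits of the product, while bit 0 still equals n mod 2 (the inverse is odd); the mask 0xc0000001 keeps exactly those bits, and ror #28 turns the six possible values into the word offsets 0, 4, 8, 16, 20, 24 used to index the branch table (offset 12 is the unreachable udf slot). A small stand-alone C program that prints the mapping, assuming nothing beyond 32-bit unsigned arithmetic:

    #include <stdio.h>
    #include <stdint.h>

    int
    main (void)
    {
      for (uint32_t n = 3; n <= 14; n++)
        {
          uint32_t w = n * 0xaaaaaaabu;         /* n * 3^{-1} mod 2^32 */
          w &= 0xc0000001u;                     /* top bits ~ n mod 3, bit 0 ~ n mod 2 */
          uint32_t off = (w >> 28) | (w << 4);  /* ror #28: a small multiple of 4 */
          printf ("n=%2u  n mod 6 = %u  branch-table offset = %2u\n", n, n % 6, off);
        }
      return 0;
    }
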
diff --git a/gmp-6.3.0/mpn/arm/v6/dive_1.asm b/gmp-6.3.0/mpn/arm/v6/dive_1.asm
new file mode 100644
index 0000000..92de814
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/dive_1.asm
@@ -0,0 +1,149 @@
+dnl ARM v6 mpn_divexact_1
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb cycles/limb
+C norm unorm modexact_1c_odd
+C StrongARM - -
+C XScale - -
+C Cortex-A7 ? ?
+C Cortex-A8 ? ?
+C Cortex-A9 9 10 9
+C Cortex-A15 7 7 7
+
+C Architecture requirements:
+C v5 -
+C v5t clz
+C v5te -
+C v6 umaal
+C v6t2 -
+C v7a -
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+define(`d', `r3')
+
+define(`cy', `r7')
+define(`cnt', `r6')
+define(`tnc', `r10')
+
+ASM_START()
+PROLOGUE(mpn_divexact_1)
+ push {r4,r5,r6,r7,r8,r9}
+
+ tst d, #1
+
+ rsb r4, d, #0
+ and r4, r4, d
+ clz r4, r4
+ rsb cnt, r4, #31 C count_trailing_zeros
+ mov d, d, lsr cnt
+
+C binvert limb
+ LEA( r4, binvert_limb_table)
+ and r12, d, #254
+ ldrb r4, [r4, r12, lsr #1]
+ mul r12, r4, r4
+ mul r12, d, r12
+ rsb r12, r12, r4, lsl #1
+ mul r4, r12, r12
+ mul r4, d, r4
+ rsb r4, r4, r12, lsl #1 C r4 = inverse
+
+ ldr r5, [up], #4 C up[0]
+ mov cy, #0
+ rsb r8, r4, #0 C r8 = -inverse
+ beq L(unnorm)
+
+L(norm):
+ subs n, n, #1
+ mul r5, r5, r4
+ beq L(end)
+
+ ALIGN(16)
+L(top): ldr r9, [up], #4
+ mov r12, #0
+ str r5, [rp], #4
+ umaal r12, cy, r5, d
+ mul r5, r9, r4
+ mla r5, cy, r8, r5
+ subs n, n, #1
+ bne L(top)
+
+L(end): str r5, [rp]
+ pop {r4,r5,r6,r7,r8,r9}
+ bx r14
+
+L(unnorm):
+ push {r10,r11}
+ rsb tnc, cnt, #32
+ mov r11, r5, lsr cnt
+ subs n, n, #1
+ beq L(edx)
+
+ ldr r12, [up], #4
+ orr r9, r11, r12, lsl tnc
+ mov r11, r12, lsr cnt
+ mul r5, r9, r4
+ subs n, n, #1
+ beq L(edu)
+
+ ALIGN(16)
+L(tpu): ldr r12, [up], #4
+ orr r9, r11, r12, lsl tnc
+ mov r11, r12, lsr cnt
+ mov r12, #0
+ str r5, [rp], #4
+ umaal r12, cy, r5, d
+ mul r5, r9, r4
+ mla r5, cy, r8, r5
+ subs n, n, #1
+ bne L(tpu)
+
+L(edu): str r5, [rp], #4
+ mov r12, #0
+ umaal r12, cy, r5, d
+ mul r5, r11, r4
+ mla r5, cy, r8, r5
+ str r5, [rp]
+ pop {r10,r11}
+ pop {r4,r5,r6,r7,r8,r9}
+ bx r14
+
+L(edx): mul r5, r11, r4
+ str r5, [rp]
+ pop {r10,r11}
+ pop {r4,r5,r6,r7,r8,r9}
+ bx r14
+EPILOGUE()
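
The "binvert limb" block computes d^{-1} mod 2^32 for the odd part of d: an 8-bit inverse is looked up in binvert_limb_table (indexed by (d & 254) >> 1, i.e. bits 1-7 of the odd divisor), then each x <- 2*x - d*x*x step doubles the number of correct low bits, so two steps reach 32. The exact-division loop then multiplies each borrow-adjusted limb by that inverse, the mla with the negated inverse folding the running borrow into the next quotient limb. A self-contained C sketch of the same Newton iteration, using the well-known (3*d) XOR 2 seed instead of GMP's table, so it illustrates the idea rather than reproducing the code above:

    #include <stdint.h>

    /* d must be odd; returns x with d * x == 1 (mod 2^32) */
    static uint32_t
    binvert32 (uint32_t d)
    {
      uint32_t x = (d * 3) ^ 2;     /* correct to 5 low bits */
      x = 2 * x - d * x * x;        /* 10 bits */
      x = 2 * x - d * x * x;        /* 20 bits */
      x = 2 * x - d * x * x;        /* 32 bits */
      return x;
    }
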
diff --git a/gmp-6.3.0/mpn/arm/v6/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v6/gmp-mparam.h
new file mode 100644
index 0000000..35a7c55
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/gmp-mparam.h
@@ -0,0 +1,187 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 32
+#define GMP_LIMB_BYTES 4
+
+/* 700 MHz ARM11 (raspberry pi) */
+/* FFT tuning limit = 8,088,775 */
+/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */
+
+#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */
+#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1_NORM_THRESHOLD 0 /* always */
+#define MOD_1_UNNORM_THRESHOLD 0 /* always */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 7
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 6
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 19
+#define USE_PREINV_DIVREM_1 1 /* preinv always */
+#define DIV_QR_1N_PI1_METHOD 1 /* 71.61% faster than 2 */
+#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVEXACT_1_THRESHOLD 0 /* always (native) */
+#define BMOD_1_TO_MOD_1_THRESHOLD 38
+
+#define DIV_1_VS_MUL_1_PERCENT 251
+
+#define MUL_TOOM22_THRESHOLD 38
+#define MUL_TOOM33_THRESHOLD 134
+#define MUL_TOOM44_THRESHOLD 512
+#define MUL_TOOM6H_THRESHOLD 0 /* always */
+#define MUL_TOOM8H_THRESHOLD 620
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 209
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 625
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 209
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 211
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 300
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 55
+#define SQR_TOOM3_THRESHOLD 200
+#define SQR_TOOM4_THRESHOLD 470
+#define SQR_TOOM6_THRESHOLD 614
+#define SQR_TOOM8_THRESHOLD 882
+
+#define MULMID_TOOM42_THRESHOLD 62
+
+#define MULMOD_BNM1_THRESHOLD 23
+#define SQRMOD_BNM1_THRESHOLD 26
+
+#define MUL_FFT_MODF_THRESHOLD 565 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 15, 5}, { 31, 6}, { 28, 7}, { 15, 6}, \
+ { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \
+ { 39, 7}, { 21, 6}, { 43, 7}, { 23, 6}, \
+ { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \
+ { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \
+ { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \
+ { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \
+ { 71, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \
+ { 99, 9}, { 55,10}, { 31, 9}, { 79,10}, \
+ { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \
+ { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \
+ { 191,10}, { 111,11}, { 63,10}, { 159,11}, \
+ { 95,10}, { 207,12}, { 63,11}, { 127,10}, \
+ { 255, 9}, { 511,10}, { 271,11}, { 159,10}, \
+ { 351,11}, { 191,10}, { 399,11}, { 223,12}, \
+ { 127,11}, { 255,10}, { 511,11}, { 287,10}, \
+ { 607,11}, { 319,10}, { 639,11}, { 351,12}, \
+ { 191,11}, { 415,13}, { 127,12}, { 255,11}, \
+ { 575,12}, { 319,11}, { 671,12}, { 383,11}, \
+ { 799,12}, { 447,13}, { 255,12}, { 511,11}, \
+ { 1023,12}, { 703,13}, { 383,12}, { 895,14}, \
+ { 255,13}, { 511,12}, { 1151,13}, { 639,12}, \
+ { 1343,13}, { 767,12}, { 1599,13}, { 895,14}, \
+ { 16384,15}, { 32768,16} }
+#define MUL_FFT_TABLE3_SIZE 98
+#define MUL_FFT_THRESHOLD 5760
+
+#define SQR_FFT_MODF_THRESHOLD 530 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 530, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \
+ { 28, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \
+ { 35, 7}, { 19, 6}, { 39, 7}, { 21, 6}, \
+ { 43, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \
+ { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \
+ { 23, 7}, { 49, 8}, { 27, 7}, { 55, 8}, \
+ { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \
+ { 55, 9}, { 31, 8}, { 71, 9}, { 39, 8}, \
+ { 83, 9}, { 47, 8}, { 95, 9}, { 55,10}, \
+ { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \
+ { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \
+ { 167,10}, { 95, 9}, { 191,10}, { 111,11}, \
+ { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \
+ { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \
+ { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \
+ { 543,10}, { 287,11}, { 159,10}, { 351,11}, \
+ { 191,10}, { 415,11}, { 223,12}, { 127,11}, \
+ { 255,10}, { 543,11}, { 287,10}, { 607,11}, \
+ { 319,10}, { 639,11}, { 351,12}, { 191,11}, \
+ { 383,10}, { 767,11}, { 415,13}, { 127,12}, \
+ { 255,11}, { 607,12}, { 319,11}, { 703,12}, \
+ { 383,11}, { 799,12}, { 447,11}, { 895,13}, \
+ { 255,12}, { 511,11}, { 1023,12}, { 703,13}, \
+ { 383,12}, { 895,14}, { 255,13}, { 511,12}, \
+ { 1151,13}, { 639,12}, { 1343,13}, { 767,12}, \
+ { 1599,13}, { 895,14}, { 16384,15}, { 32768,16} }
+#define SQR_FFT_TABLE3_SIZE 104
+#define SQR_FFT_THRESHOLD 4416
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 51
+#define MULLO_MUL_N_THRESHOLD 11278
+#define SQRLO_BASECASE_THRESHOLD 10
+#define SQRLO_DC_THRESHOLD 55
+#define SQRLO_SQR_THRESHOLD 8648
+
+#define DC_DIV_QR_THRESHOLD 36
+#define DC_DIVAPPR_Q_THRESHOLD 146
+#define DC_BDIV_QR_THRESHOLD 46
+#define DC_BDIV_Q_THRESHOLD 160
+
+#define INV_MULMOD_BNM1_THRESHOLD 74
+#define INV_NEWTON_THRESHOLD 145
+#define INV_APPR_THRESHOLD 147
+
+#define BINV_NEWTON_THRESHOLD 372
+#define REDC_1_TO_REDC_2_THRESHOLD 6
+#define REDC_2_TO_REDC_N_THRESHOLD 140
+
+#define MU_DIV_QR_THRESHOLD 2801
+#define MU_DIVAPPR_Q_THRESHOLD 2801
+#define MUPI_DIV_QR_THRESHOLD 79
+#define MU_BDIV_QR_THRESHOLD 2541
+#define MU_BDIV_Q_THRESHOLD 2764
+
+#define POWM_SEC_TABLE 3,20,139,734
+
+#define GET_STR_DC_THRESHOLD 27
+#define GET_STR_PRECOMPUTE_THRESHOLD 45
+#define SET_STR_DC_THRESHOLD 342
+#define SET_STR_PRECOMPUTE_THRESHOLD 1290
+
+#define FAC_DSC_THRESHOLD 390
+#define FAC_ODD_THRESHOLD 438
+
+#define MATRIX22_STRASSEN_THRESHOLD 25
+#define HGCD2_DIV1_METHOD 5 /* 1.32% faster than 3 */
+#define HGCD_THRESHOLD 82
+#define HGCD_APPR_THRESHOLD 81
+#define HGCD_REDUCE_THRESHOLD 4633
+#define GCD_DC_THRESHOLD 345
+#define GCDEXT_DC_THRESHOLD 268
+#define JACOBI_BASE_METHOD 1 /* 3.30% faster than 2 */
+
+/* Tuneup completed successfully, took 45018 seconds */
diff --git a/gmp-6.3.0/mpn/arm/v6/mode1o.asm b/gmp-6.3.0/mpn/arm/v6/mode1o.asm
new file mode 100644
index 0000000..a2f77a6
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/mode1o.asm
@@ -0,0 +1,95 @@
+dnl ARM v6 mpn_modexact_1c_odd
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 9
+C Cortex-A15 7
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te smulbb
+C v6 umaal
+C v6t2 -
+C v7a -
+
+define(`up', `r0')
+define(`n', `r1')
+define(`d', `r2')
+define(`cy', `r3')
+
+ .protected binvert_limb_table
+ASM_START()
+PROLOGUE(mpn_modexact_1c_odd)
+ stmfd sp!, {r4, r5, r6, r7}
+
+ LEA( r4, binvert_limb_table)
+
+ ldr r6, [up], #4 C up[0]
+
+ and r12, d, #254
+ ldrb r4, [r4, r12, lsr #1]
+ smulbb r12, r4, r4
+ mul r12, d, r12
+ rsb r12, r12, r4, asl #1
+ mul r4, r12, r12
+ mul r4, d, r4
+ rsb r4, r4, r12, asl #1 C r4 = inverse
+
+ subs n, n, #1
+ sub r6, r6, cy
+ mul r6, r6, r4
+ beq L(end)
+
+ rsb r5, r4, #0 C r5 = -inverse
+
+L(top): ldr r7, [up], #4
+ mov r12, #0
+ umaal r12, cy, r6, d
+ mul r6, r7, r4
+ mla r6, cy, r5, r6
+ subs n, n, #1
+ bne L(top)
+
+L(end): mov r12, #0
+ umaal r12, cy, r6, d
+ mov r0, cy
+
+ ldmfd sp!, {r4, r5, r6, r7}
+ bx r14
+EPILOGUE()
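
Reading the loop above: each step chooses q = (limb - carry) * d^{-1} mod 2^32, so that q*d cancels the current limb exactly modulo 2^32; the high half of q*d + carry becomes the carry into the next limb, and the final carry is the return value. A direct C transcription of that recurrence follows; ref_modexact_1c_odd and the explicit binv argument are illustrative only, not GMP's interface, and 32-bit limbs are assumed:

    #include <gmp.h>

    mp_limb_t
    ref_modexact_1c_odd (const mp_limb_t *up, mp_size_t n, mp_limb_t d,
                         mp_limb_t cy, mp_limb_t binv)  /* binv = d^{-1} mod 2^32 */
    {
      for (mp_size_t i = 0; i < n; i++)
        {
          mp_limb_t q = (up[i] - cy) * binv;                 /* mod 2^32 */
          unsigned long long t = (unsigned long long) q * d + cy;
          /* the low 32 bits of t reproduce up[i]; only the high half carries on */
          cy = (mp_limb_t) (t >> 32);
        }
      return cy;
    }
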
diff --git a/gmp-6.3.0/mpn/arm/v6/mul_1.asm b/gmp-6.3.0/mpn/arm/v6/mul_1.asm
new file mode 100644
index 0000000..3c6ef99
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/mul_1.asm
@@ -0,0 +1,115 @@
+dnl ARM mpn_mul_1.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C ARM11 6.4
+C Cortex-A7 5.25
+C Cortex-A8 7
+C Cortex-A9 3.25
+C Cortex-A15 4
+
+C TODO
+C * Micro-optimise feed-in code.
+C * Optimise for n=1,2 by delaying register saving.
+C * Try using ldm/stm.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`v0',`r3')
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ stmfd sp!, { r4, r5, r6, r7 }
+
+ ands r6, n, #3
+ mov r12, #0
+ beq L(fi0)
+ cmp r6, #2
+ bcc L(fi1)
+ beq L(fi2)
+
+L(fi3): ldr r4, [up], #4
+ mov r6, #0
+ ldr r5, [up], #4
+ b L(lo3)
+
+L(fi0): ldr r5, [up], #4
+ add rp, rp, #4
+ mov r7, #0
+ ldr r4, [up], #4
+ b L(lo0)
+
+L(fi1): ldr r4, [up], #4
+ mov r6, #0
+ add rp, rp, #8
+ subs n, n, #1
+ beq L(1)
+ ldr r5, [up], #4
+ b L(lo1)
+
+L(fi2): ldr r5, [up], #4
+ add rp, rp, #12
+ mov r7, #0
+ ldr r4, [up], #4
+ b L(lo2)
+
+ ALIGN(16)
+L(top): mov r6, #0
+ ldr r5, [up], #4
+ str r7, [rp, #-12]
+L(lo1): umaal r6, r12, r4, v0
+ mov r7, #0
+ ldr r4, [up], #4
+ str r6, [rp, #-8]
+L(lo0): umaal r7, r12, r5, v0
+ mov r6, #0
+ ldr r5, [up], #4
+ str r7, [rp, #-4]
+L(lo3): umaal r6, r12, r4, v0
+ mov r7, #0
+ ldr r4, [up], #4
+ str r6, [rp], #16
+L(lo2): umaal r7, r12, r5, v0
+ subs n, n, #4
+ bhi L(top)
+
+ mov r6, #0
+ str r7, [rp, #-12]
+L(1): umaal r6, r12, r4, v0
+ str r6, [rp, #-8]
+ mov r0, r12
+ ldmfd sp!, { r4, r5, r6, r7 }
+ bx lr
+EPILOGUE()
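
mpn_mul_1 writes {up,n} * v0 to {rp,n} and returns the carry limb; the loop above is the addmul_1 skeleton with the rp[] loads replaced by mov ...,#0, so umaal only folds in the running carry. The corresponding C sketch (hypothetical ref_mul_1, 32-bit limbs assumed):

    #include <gmp.h>

    mp_limb_t
    ref_mul_1 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, mp_limb_t v0)
    {
      mp_limb_t cy = 0;
      for (mp_size_t i = 0; i < n; i++)
        {
          /* umaal with a zeroed addend register: product + carry only */
          unsigned long long t = (unsigned long long) up[i] * v0 + cy;
          rp[i] = (mp_limb_t) t;
          cy = (mp_limb_t) (t >> 32);
        }
      return cy;
    }
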
diff --git a/gmp-6.3.0/mpn/arm/v6/mul_2.asm b/gmp-6.3.0/mpn/arm/v6/mul_2.asm
new file mode 100644
index 0000000..edd27f3
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/mul_2.asm
@@ -0,0 +1,135 @@
+dnl ARM mpn_mul_2.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C ARM11 5.25
+C Cortex-A5 3.63
+C Cortex-A7 3.15
+C Cortex-A8 5.0
+C Cortex-A9 2.25
+C Cortex-A15 2.5
+C Cortex-A17 2.13
+C Cortex-A53 3.5
+
+C TODO
+C * This is a trivial edit of the addmul_2 code. Check for simplifications,
+C and possible speedups to 2.0 c/l.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`vp',`r3')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`u0',`r3')
+define(`u1',`r9')
+
+define(`cya',`r8')
+define(`cyb',`r12')
+
+
+ASM_START()
+PROLOGUE(mpn_mul_2)
+ push { r4, r5, r6, r7, r8, r9 }
+
+ ldm vp, { v0, v1 }
+ mov cya, #0
+ mov cyb, #0
+
+ tst n, #1
+ beq L(evn)
+L(odd): mov r5, #0
+ ldr u0, [up, #0]
+ mov r4, #0
+ tst n, #2
+ beq L(fi1)
+L(fi3): sub up, up, #12
+ sub rp, rp, #16
+ b L(lo3)
+L(fi1): sub n, n, #1
+ sub up, up, #4
+ sub rp, rp, #8
+ b L(lo1)
+L(evn): mov r4, #0
+ ldr u1, [up, #0]
+ mov r5, #0
+ tst n, #2
+ bne L(fi2)
+L(fi0): sub up, up, #8
+ sub rp, rp, #12
+ b L(lo0)
+L(fi2): subs n, n, #2
+ sub rp, rp, #4
+ bls L(end)
+
+ ALIGN(16)
+L(top): ldr u0, [up, #4]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #4]
+ mov r4, #0
+ umaal r5, cyb, u1, v1
+L(lo1): ldr u1, [up, #8]
+ umaal r5, cya, u0, v0
+ str r5, [rp, #8]
+ mov r5, #0
+ umaal r4, cyb, u0, v1
+L(lo0): ldr u0, [up, #12]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #12]
+ mov r4, #0
+ umaal r5, cyb, u1, v1
+L(lo3): ldr u1, [up, #16]!
+ umaal r5, cya, u0, v0
+ str r5, [rp, #16]!
+ mov r5, #0
+ umaal r4, cyb, u0, v1
+ subs n, n, #4
+ bhi L(top)
+
+L(end): umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #4]
+ umaal r5, cya, u0, v0
+ umaal cya, cyb, u0, v1
+ str r5, [rp, #8]
+ str cya, [rp, #12]
+ mov r0, cyb
+
+ pop { r4, r5, r6, r7, r8, r9 }
+ bx r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v6/popham.asm b/gmp-6.3.0/mpn/arm/v6/popham.asm
new file mode 100644
index 0000000..c254368
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/popham.asm
@@ -0,0 +1,139 @@
+dnl ARM mpn_popcount and mpn_hamdist.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C popcount hamdist
+C cycles/limb cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 8.94 9.47
+C Cortex-A15 5.67 6.44
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 usada8
+C v6t2 -
+C v7a -
+
+ifdef(`OPERATION_popcount',`
+ define(`func',`mpn_popcount')
+ define(`ap', `r0')
+ define(`n', `r1')
+ define(`a0', `r2')
+ define(`a1', `r3')
+ define(`s', `r5')
+ define(`b_01010101', `r6')
+ define(`b_00110011', `r7')
+ define(`b_00001111', `r8')
+ define(`zero', `r9')
+ define(`POPC', `$1')
+ define(`HAMD', `dnl')
+')
+ifdef(`OPERATION_hamdist',`
+ define(`func',`mpn_hamdist')
+ define(`ap', `r0')
+ define(`bp', `r1')
+ define(`n', `r2')
+ define(`a0', `r6')
+ define(`a1', `r7')
+ define(`b0', `r4')
+ define(`b1', `r5')
+ define(`s', `r11')
+ define(`b_01010101', `r8')
+ define(`b_00110011', `r9')
+ define(`b_00001111', `r10')
+ define(`zero', `r3')
+ define(`POPC', `dnl')
+ define(`HAMD', `$1')
+')
+
+MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
+
+ASM_START()
+PROLOGUE(func)
+POPC(` push { r4-r9 } ')
+HAMD(` push { r4-r11 } ')
+
+ ldr b_01010101, =0x55555555
+ mov r12, #0
+ ldr b_00110011, =0x33333333
+ mov zero, #0
+ ldr b_00001111, =0x0f0f0f0f
+
+ tst n, #1
+ beq L(evn)
+
+L(odd): ldr a1, [ap], #4 C 1 x 32 1-bit accumulators, 0-1
+HAMD(` ldr b1, [bp], #4 ') C 1 x 32 1-bit accumulators, 0-1
+HAMD(` eor a1, a1, b1 ')
+ and r4, b_01010101, a1, lsr #1
+ sub a1, a1, r4
+ and r4, a1, b_00110011
+ bic r5, a1, b_00110011
+ add r5, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4
+ subs n, n, #1
+ b L(mid)
+
+L(evn): mov s, #0
+
+L(top): ldrd a0, a1, [ap], #8 C 2 x 32 1-bit accumulators, 0-1
+HAMD(` ldrd b0, b1, [bp], #8')
+HAMD(` eor a0, a0, b0 ')
+HAMD(` eor a1, a1, b1 ')
+ subs n, n, #2
+ usada8 r12, s, zero, r12
+ and r4, b_01010101, a0, lsr #1
+ sub a0, a0, r4
+ and r4, b_01010101, a1, lsr #1
+ sub a1, a1, r4
+ and r4, a0, b_00110011
+ bic r5, a0, b_00110011
+ add a0, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4
+ and r4, a1, b_00110011
+ bic r5, a1, b_00110011
+ add a1, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4
+ add r5, a0, a1 C 8 4-bit accumulators, 0-8
+L(mid): and r4, r5, b_00001111
+ bic r5, r5, b_00001111
+ add s, r4, r5, lsr #4 C 4 8-bit accumulators
+ bne L(top)
+
+ usada8 r0, s, zero, r12
+POPC(` pop { r4-r9 } ')
+HAMD(` pop { r4-r11 } ')
+ bx r14
+EPILOGUE()
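
Both entry points share one loop: mpn_popcount counts the set bits of {ap,n}, and mpn_hamdist does the same after XORing in {bp,n}. The masks 0x55555555, 0x33333333 and 0x0f0f0f0f implement the classic SWAR reduction from 32 one-bit counters down to four byte-wide counters, which usada8 then accumulates byte-wise into r12. The same reduction for a single 32-bit word in C, as a sketch only, with a multiply standing in for usada8 when summing the four bytes:

    #include <stdint.h>

    static unsigned
    popcount32 (uint32_t a)
    {
      a = a - ((a >> 1) & 0x55555555u);                   /* 16 x 2-bit counts, 0..2 */
      a = (a & 0x33333333u) + ((a >> 2) & 0x33333333u);   /* 8 x 4-bit counts, 0..4  */
      a = (a + (a >> 4)) & 0x0f0f0f0fu;                   /* 4 x 8-bit counts, 0..8  */
      return (a * 0x01010101u) >> 24;                     /* sum the four bytes      */
    }
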
diff --git a/gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm b/gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm
new file mode 100644
index 0000000..0fc4f13
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm
@@ -0,0 +1,544 @@
+dnl ARM v6 mpn_sqr_basecase.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C Code structure:
+C
+C
+C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4)
+C | | | |
+C | | | |
+C | | | |
+C \|/ \|/ \|/ \|/
+C ____________ ____________
+C / \ / \
+C \|/ \ \|/ \
+C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4)
+C \ /|\ \ /|\
+C \____________/ \____________/
+C \ /
+C \ /
+C \ /
+C cor3 cor2
+C \ /
+C \ /
+C sqr_diag_addlsh1
+
+C TODO
+C * Align more labels.
+C * Further tweak counter and updates in outer loops. (This could save
+C perhaps 5n cycles).
+C * Avoid sub-with-lsl in outer loops. We could keep n up-shifted, then
+C initialise loop counter i with a right shift.
+C * Try to use fewer registers. Perhaps coalesce r9 branch target and n_saved.
+C (This could save 2-3 cycles for n > 4.)
+C * Optimise sqr_diag_addlsh1 loop. The current code uses old-style carry
+C propagation.
+C * Stop loops earlier suppressing writes of upper-most rp[] values.
+C * The addmul_2 loops here run well on all cores, but mul_2 runs poorly,
+C particularly on Cortex-A8.
+
+
+define(`rp', r0)
+define(`up', r1)
+define(`n', r2)
+
+define(`v0', r3)
+define(`v1', r6)
+define(`i', r8)
+define(`n_saved', r14)
+define(`cya', r11)
+define(`cyb', r12)
+define(`u0', r7)
+define(`u1', r9)
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+ and r12, n, #3
+ cmp n, #4
+ addgt r12, r12, #4
+ add pc, pc, r12, lsl #2
+ nop
+ b L(4)
+ b L(1)
+ b L(2)
+ b L(3)
+ b L(0m4)
+ b L(1m4)
+ b L(2m4)
+ b L(3m4)
+
+
+L(1m4): push {r4-r11, r14}
+ mov n_saved, n
+ sub i, n, #4
+ sub n, n, #2
+ add r10, pc, #L(am2_2m4)-.-8
+ ldm up, {v0,v1,u0}
+ sub up, up, #4
+ mov cyb, #0
+ mov r5, #0
+ umull r4, cya, v1, v0
+ str r4, [rp], #-12
+ mov r4, #0
+ b L(ko0)
+
+L(3m4): push {r4-r11, r14}
+ mov n_saved, n
+ sub i, n, #4
+ sub n, n, #2
+ add r10, pc, #L(am2_0m4)-.-8
+ ldm up, {v0,v1,u0}
+ add up, up, #4
+ mov cyb, #0
+ mov r5, #0
+ umull r4, cya, v1, v0
+ str r4, [rp], #-4
+ mov r4, #0
+ b L(ko2)
+
+L(2m4): push {r4-r11, r14}
+ mov n_saved, n
+ sub i, n, #4
+ sub n, n, #2
+ add r10, pc, #L(am2_3m4)-.-8
+ ldm up, {v0,v1,u1}
+ mov cyb, #0
+ mov r4, #0
+ umull r5, cya, v1, v0
+ str r5, [rp], #-8
+ mov r5, #0
+ b L(ko1)
+
+L(0m4): push {r4-r11, r14}
+ mov n_saved, n
+ sub i, n, #4
+ sub n, n, #2
+ add r10, pc, #L(am2_1m4)-.-8
+ ldm up, {v0,v1,u1}
+ mov cyb, #0
+ mov r4, #0
+ add up, up, #8
+ umull r5, cya, v1, v0
+ str r5, [rp, #0]
+ mov r5, #0
+
+L(top): ldr u0, [up, #4]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #4]
+ mov r4, #0
+ umaal r5, cyb, u1, v1
+L(ko2): ldr u1, [up, #8]
+ umaal r5, cya, u0, v0
+ str r5, [rp, #8]
+ mov r5, #0
+ umaal r4, cyb, u0, v1
+L(ko1): ldr u0, [up, #12]
+ umaal r4, cya, u1, v0
+ str r4, [rp, #12]
+ mov r4, #0
+ umaal r5, cyb, u1, v1
+L(ko0): ldr u1, [up, #16]!
+ umaal r5, cya, u0, v0
+ str r5, [rp, #16]!
+ mov r5, #0
+ umaal r4, cyb, u0, v1
+ subs i, i, #4
+ bhi L(top)
+
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #4]
+ umaal r5, cya, u0, v0
+ umaal cya, cyb, u0, v1
+ str r5, [rp, #8]
+ str cya, [rp, #12]
+ str cyb, [rp, #16]
+
+ add up, up, #4
+ sub n, n, #1
+ add rp, rp, #8
+ bx r10
+
+L(evnloop):
+ subs i, n, #6
+ sub n, n, #2
+ blt L(cor2)
+ ldm up, {v0,v1,u1}
+ add up, up, #8
+ mov cya, #0
+ mov cyb, #0
+ ldr r4, [rp, #-4]
+ umaal r4, cya, v1, v0
+ str r4, [rp, #-4]
+ ldr r4, [rp, #0]
+
+ ALIGN(16)
+L(ua2): ldr r5, [rp, #4]
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #0]
+ ldr r4, [rp, #8]
+ umaal r5, cya, u0, v0
+ ldr u1, [up, #8]
+ umaal r4, cyb, u0, v1
+ str r5, [rp, #4]
+ ldr r5, [rp, #12]
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #12]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #8]
+ ldr r4, [rp, #16]!
+ umaal r5, cya, u0, v0
+ ldr u1, [up, #16]!
+ umaal r4, cyb, u0, v1
+ str r5, [rp, #-4]
+ subs i, i, #4
+ bhs L(ua2)
+
+ umaal r4, cya, u1, v0
+ umaal cya, cyb, u1, v1
+ str r4, [rp, #0]
+ str cya, [rp, #4]
+ str cyb, [rp, #8]
+L(am2_0m4):
+ sub rp, rp, n, lsl #2
+ sub up, up, n, lsl #2
+ add rp, rp, #8
+
+ sub i, n, #4
+ sub n, n, #2
+ ldm up, {v0,v1,u1}
+ mov cya, #0
+ mov cyb, #0
+ ldr r4, [rp, #4]
+ umaal r4, cya, v1, v0
+ str r4, [rp, #4]
+ ldr r4, [rp, #8]
+ b L(lo0)
+
+ ALIGN(16)
+L(ua0): ldr r5, [rp, #4]
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #0]
+ ldr r4, [rp, #8]
+ umaal r5, cya, u0, v0
+ ldr u1, [up, #8]
+ umaal r4, cyb, u0, v1
+ str r5, [rp, #4]
+L(lo0): ldr r5, [rp, #12]
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #12]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #8]
+ ldr r4, [rp, #16]!
+ umaal r5, cya, u0, v0
+ ldr u1, [up, #16]!
+ umaal r4, cyb, u0, v1
+ str r5, [rp, #-4]
+ subs i, i, #4
+ bhs L(ua0)
+
+ umaal r4, cya, u1, v0
+ umaal cya, cyb, u1, v1
+ str r4, [rp, #0]
+ str cya, [rp, #4]
+ str cyb, [rp, #8]
+L(am2_2m4):
+ sub rp, rp, n, lsl #2
+ sub up, up, n, lsl #2
+ add rp, rp, #16
+ b L(evnloop)
+
+
+L(oddloop):
+ sub i, n, #5
+ sub n, n, #2
+ ldm up, {v0,v1,u0}
+ mov cya, #0
+ mov cyb, #0
+ ldr r5, [rp, #0]
+ umaal r5, cya, v1, v0
+ str r5, [rp, #0]
+ ldr r5, [rp, #4]
+ add up, up, #4
+ b L(lo1)
+
+ ALIGN(16)
+L(ua1): ldr r5, [rp, #4]
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #0]
+L(lo1): ldr r4, [rp, #8]
+ umaal r5, cya, u0, v0
+ ldr u1, [up, #8]
+ umaal r4, cyb, u0, v1
+ str r5, [rp, #4]
+ ldr r5, [rp, #12]
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #12]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #8]
+ ldr r4, [rp, #16]!
+ umaal r5, cya, u0, v0
+ ldr u1, [up, #16]!
+ umaal r4, cyb, u0, v1
+ str r5, [rp, #-4]
+ subs i, i, #4
+ bhs L(ua1)
+
+ umaal r4, cya, u1, v0
+ umaal cya, cyb, u1, v1
+ str r4, [rp, #0]
+ str cya, [rp, #4]
+ str cyb, [rp, #8]
+L(am2_3m4):
+ sub rp, rp, n, lsl #2
+ sub up, up, n, lsl #2
+ add rp, rp, #4
+
+ subs i, n, #3
+ beq L(cor3)
+ sub n, n, #2
+ ldm up, {v0,v1,u0}
+ mov cya, #0
+ mov cyb, #0
+ ldr r5, [rp, #8]
+ sub up, up, #4
+ umaal r5, cya, v1, v0
+ str r5, [rp, #8]
+ ldr r5, [rp, #12]
+ b L(lo3)
+
+ ALIGN(16)
+L(ua3): ldr r5, [rp, #4]
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #4]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #0]
+ ldr r4, [rp, #8]
+ umaal r5, cya, u0, v0
+ ldr u1, [up, #8]
+ umaal r4, cyb, u0, v1
+ str r5, [rp, #4]
+ ldr r5, [rp, #12]
+ umaal r4, cya, u1, v0
+ ldr u0, [up, #12]
+ umaal r5, cyb, u1, v1
+ str r4, [rp, #8]
+L(lo3): ldr r4, [rp, #16]!
+ umaal r5, cya, u0, v0
+ ldr u1, [up, #16]!
+ umaal r4, cyb, u0, v1
+ str r5, [rp, #-4]
+ subs i, i, #4
+ bhs L(ua3)
+
+ umaal r4, cya, u1, v0
+ umaal cya, cyb, u1, v1
+ str r4, [rp, #0]
+ str cya, [rp, #4]
+ str cyb, [rp, #8]
+L(am2_1m4):
+ sub rp, rp, n, lsl #2
+ sub up, up, n, lsl #2
+ add rp, rp, #12
+ b L(oddloop)
+
+
+L(cor3):ldm up, {v0,v1,u0}
+ ldr r5, [rp, #8]
+ mov cya, #0
+ mov cyb, #0
+ umaal r5, cya, v1, v0
+ str r5, [rp, #8]
+ ldr r5, [rp, #12]
+ ldr r4, [rp, #16]
+ umaal r5, cya, u0, v0
+ ldr u1, [up, #12]
+ umaal r4, cyb, u0, v1
+ str r5, [rp, #12]
+ umaal r4, cya, u1, v0
+ umaal cya, cyb, u1, v1
+ str r4, [rp, #16]
+ str cya, [rp, #20]
+ str cyb, [rp, #24]
+ add up, up, #16
+ mov cya, cyb
+ adds rp, rp, #36 C clear cy
+ mov cyb, #0
+ umaal cya, cyb, u1, u0
+ b L(sqr_diag_addlsh1)
+
+L(cor2):
+ ldm up!, {v0,v1,u0}
+ mov r4, cya
+ mov r5, cyb
+ mov cya, #0
+ umaal r4, cya, v1, v0
+ mov cyb, #0
+ umaal r5, cya, u0, v0
+ strd r4, r5, [rp, #-4]
+ umaal cya, cyb, u0, v1
+ add rp, rp, #16
+C b L(sqr_diag_addlsh1)
+
+
+define(`w0', r6)
+define(`w1', r7)
+define(`w2', r8)
+define(`rbx', r9)
+
+L(sqr_diag_addlsh1):
+ str cya, [rp, #-12]
+ str cyb, [rp, #-8]
+ sub n, n_saved, #1
+ sub up, up, n_saved, lsl #2
+ sub rp, rp, n_saved, lsl #3
+ ldr r3, [up], #4
+ umull w1, r5, r3, r3
+ mov w2, #0
+ mov r10, #0
+C cmn r0, #0 C clear cy (already clear)
+ b L(lm)
+
+L(tsd): adds w0, w0, rbx
+ adcs w1, w1, r4
+ str w0, [rp, #0]
+L(lm): ldr w0, [rp, #4]
+ str w1, [rp, #4]
+ ldr w1, [rp, #8]!
+ add rbx, r5, w2
+ adcs w0, w0, w0
+ ldr r3, [up], #4
+ adcs w1, w1, w1
+ adc w2, r10, r10
+ umull r4, r5, r3, r3
+ subs n, n, #1
+ bne L(tsd)
+
+ adds w0, w0, rbx
+ adcs w1, w1, r4
+ adc w2, r5, w2
+ stm rp, {w0,w1,w2}
+
+ pop {r4-r11, pc}
+
+
+C Straight line code for n <= 4
+
+L(1): ldr r3, [up, #0]
+ umull r1, r2, r3, r3
+ stm rp, {r1,r2}
+ bx r14
+
+L(2): push {r4-r5}
+ ldm up, {r5,r12}
+ umull r1, r2, r5, r5
+ umull r3, r4, r12, r12
+ umull r5, r12, r5, r12
+ adds r5, r5, r5
+ adcs r12, r12, r12
+ adc r4, r4, #0
+ adds r2, r2, r5
+ adcs r3, r3, r12
+ adc r4, r4, #0
+ stm rp, {r1,r2,r3,r4}
+ pop {r4-r5}
+ bx r14
+
+L(3): push {r4-r11}
+ ldm up, {r7,r8,r9}
+ umull r1, r2, r7, r7
+ umull r3, r4, r8, r8
+ umull r5, r6, r9, r9
+ umull r10, r11, r7, r8
+ mov r12, #0
+ umlal r11, r12, r7, r9
+ mov r7, #0
+ umlal r12, r7, r8, r9
+ adds r10, r10, r10
+ adcs r11, r11, r11
+ adcs r12, r12, r12
+ adcs r7, r7, r7
+ adc r6, r6, #0
+ adds r2, r2, r10
+ adcs r3, r3, r11
+ adcs r4, r4, r12
+ adcs r5, r5, r7
+ adc r6, r6, #0
+ stm rp, {r1,r2,r3,r4,r5,r6}
+ pop {r4-r11}
+ bx r14
+
+L(4): push {r4-r11, r14}
+ ldm up, {r9,r10,r11,r12}
+ umull r1, r2, r9, r9
+ umull r3, r4, r10, r10
+ umull r5, r6, r11, r11
+ umull r7, r8, r12, r12
+ stm rp, {r1,r2,r3,r4,r5,r6,r7}
+ umull r1, r2, r9, r10
+ mov r3, #0
+ umlal r2, r3, r9, r11
+ mov r4, #0
+ umlal r3, r4, r9, r12
+ mov r5, #0
+ umlal r3, r5, r10, r11
+ umaal r4, r5, r10, r12
+ mov r6, #0
+ umlal r5, r6, r11, r12
+ adds r1, r1, r1
+ adcs r2, r2, r2
+ adcs r3, r3, r3
+ adcs r4, r4, r4
+ adcs r5, r5, r5
+ adcs r6, r6, r6
+ add rp, rp, #4
+ adc r7, r8, #0
+ ldm rp, {r8,r9,r10,r11,r12,r14}
+ adds r1, r1, r8
+ adcs r2, r2, r9
+ adcs r3, r3, r10
+ adcs r4, r4, r11
+ adcs r5, r5, r12
+ adcs r6, r6, r14
+ adc r7, r7, #0
+ stm rp, {r1,r2,r3,r4,r5,r6,r7}
+ pop {r4-r11, pc}
+EPILOGUE()
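
The diagram at the top of the file describes the flow: a mul_2 feed-in, a chain of addmul_2 passes over a shrinking triangle, the corner fix-ups cor2/cor3, and a final sqr_diag_addlsh1 pass. The underlying decomposition is the usual one for squaring: compute the strictly off-diagonal products u[i]*u[j] (i < j) once, then double them and add the diagonal squares u[i]^2. A readable model using public mpn calls, valid for n >= 2; this is my reconstruction of the structure, not the scheduled code above, which fuses the doubling and the diagonal pass into one loop:

    #include <gmp.h>

    void
    model_sqr_basecase (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
    {
      /* off-diagonal triangle sum_{i<j} up[i]*up[j]*B^(i+j) into rp[1 .. 2n-2] */
      rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
      for (mp_size_t k = 1; k < n - 1; k++)
        rp[n + k] = mpn_addmul_1 (rp + 2 * k + 1, up + k + 1, n - k - 1, up[k]);

      /* double the triangle (the "lsh1" part) ... */
      rp[2 * n - 1] = mpn_lshift (rp + 1, rp + 1, 2 * n - 2, 1);

      /* ... and add the squares up[i]^2 along the diagonal (the "sqr_diag" part) */
      rp[0] = 0;
      mp_limb_t cy = 0;
      for (mp_size_t i = 0; i < n; i++)
        {
          mp_limb_t sq[2];
          sq[1] = mpn_mul_1 (sq, up + i, 1, up[i]);   /* two-limb square of up[i] */
          cy = mpn_add_1 (rp + 2 * i, rp + 2 * i, 2, cy);
          cy += mpn_add_n (rp + 2 * i, rp + 2 * i, sq, 2);
        }
      /* cy ends up 0: the square fits in exactly 2n limbs */
    }
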
diff --git a/gmp-6.3.0/mpn/arm/v6/submul_1.asm b/gmp-6.3.0/mpn/arm/v6/submul_1.asm
new file mode 100644
index 0000000..8a21733
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/submul_1.asm
@@ -0,0 +1,125 @@
+dnl ARM mpn_submul_1.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 3.75
+C Cortex-A15 4.0
+
+C This loop complements U on the fly,
+C U' = B^n - 1 - U
+C and then uses that
+C R - U*v = R + U'*v + v - B^n v
+
+C TODO
+C * Micro-optimise feed-in code.
+C * Optimise for n=1,2 by delaying register saving.
+C * Try using ldm/stm.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`v0',`r3')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+ stmfd sp!, { r4, r5, r6, r7 }
+
+ ands r6, n, #3
+ mov r12, v0
+ beq L(fi0)
+ cmp r6, #2
+ bcc L(fi1)
+ beq L(fi2)
+
+L(fi3): ldr r4, [up], #12
+ mvn r4, r4
+ ldr r6, [rp, #0]
+ ldr r5, [up, #-8]
+ b L(lo3)
+
+L(fi0): ldr r5, [up], #16
+ mvn r5, r5
+ ldr r7, [rp], #4
+ ldr r4, [up, #-12]
+ b L(lo0)
+
+L(fi1): ldr r4, [up], #4
+ mvn r4, r4
+ ldr r6, [rp], #8
+ subs n, n, #1
+ beq L(1)
+ ldr r5, [up]
+ b L(lo1)
+
+L(fi2): ldr r5, [up], #8
+ mvn r5, r5
+ ldr r7, [rp], #12
+ ldr r4, [up, #-4]
+ b L(lo2)
+
+ ALIGN(16)
+L(top): ldr r6, [rp, #-8]
+ ldr r5, [up]
+ str r7, [rp, #-12]
+L(lo1): umaal r6, r12, r4, v0
+ add up, up, #16
+ mvn r5, r5
+ ldr r7, [rp, #-4]
+ ldr r4, [up, #-12]
+ str r6, [rp, #-8]
+L(lo0): umaal r7, r12, r5, v0
+ mvn r4, r4
+ ldr r6, [rp, #0]
+ ldr r5, [up, #-8]
+ str r7, [rp, #-4]
+L(lo3): umaal r6, r12, r4, v0
+ mvn r5, r5
+ ldr r7, [rp, #4]
+ ldr r4, [up, #-4]
+ str r6, [rp], #16
+L(lo2): umaal r7, r12, r5, v0
+ mvn r4, r4
+ subs n, n, #4
+ bhi L(top)
+
+ ldr r6, [rp, #-8]
+ str r7, [rp, #-12]
+L(1): umaal r6, r12, r4, v0
+ str r6, [rp, #-8]
+ sub r0, v0, r12
+ ldmfd sp!, { r4, r5, r6, r7 }
+ bx lr
+EPILOGUE()
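
The identity in the header comment is worth spelling out: with U' = B^n - 1 - U (each limb complemented by mvn on the fly), U'*v = B^n*v - v - U*v, hence R + U'*v + v = R - U*v + B^n*v. The loop can therefore stay purely additive with umaal, the carry register seeded to v (mov r12, v0) supplying the "+ v" term, and the final sub r0, v0, r12 removing the B^n*v offset to recover the borrow. For reference, a straightforward C sketch of the operation itself: multiply {up,n} by v0, subtract the product from {rp,n}, and return the most significant product limb plus the borrow (ref_submul_1 is a hypothetical name; 32-bit limbs assumed):

    #include <gmp.h>

    mp_limb_t
    ref_submul_1 (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, mp_limb_t v0)
    {
      mp_limb_t borrow = 0;
      for (mp_size_t i = 0; i < n; i++)
        {
          unsigned long long t = (unsigned long long) up[i] * v0 + borrow;
          mp_limb_t lo = (mp_limb_t) t;
          borrow = (mp_limb_t) (t >> 32);
          borrow += rp[i] < lo;        /* borrow out of the limb subtraction */
          rp[i] -= lo;
        }
      return borrow;
    }
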