Diffstat (limited to 'gmp-6.3.0/mpn/arm/neon')
-rw-r--r--  gmp-6.3.0/mpn/arm/neon/README            |   2
-rw-r--r--  gmp-6.3.0/mpn/arm/neon/hamdist.asm       | 194
-rw-r--r--  gmp-6.3.0/mpn/arm/neon/lorrshift.asm     | 279
-rw-r--r--  gmp-6.3.0/mpn/arm/neon/lshiftc.asm       | 242
-rw-r--r--  gmp-6.3.0/mpn/arm/neon/popcount.asm      | 166
-rw-r--r--  gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm | 140
6 files changed, 1023 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/arm/neon/README b/gmp-6.3.0/mpn/arm/neon/README
new file mode 100644
index 0000000..79e3b48
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/neon/README
@@ -0,0 +1,2 @@
+This directory contains Neon code which runs and is efficient on all
+ARM CPUs which support Neon.
diff --git a/gmp-6.3.0/mpn/arm/neon/hamdist.asm b/gmp-6.3.0/mpn/arm/neon/hamdist.asm
new file mode 100644
index 0000000..2320896
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/neon/hamdist.asm
@@ -0,0 +1,194 @@
+dnl ARM Neon mpn_hamdist -- mpn bit hamming distance.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.89
+C Cortex-A15 0.95
+
+C TODO
+C  * Explore using vldr and vldm.  Does it help on A9?  (These loads move
+C    64 bits at a time, which would break in big-endian mode.  Byte order
+C    does not matter for popcount itself, though it might for the edge loads.)
+C * Arrange to align the pointer, if that helps performance. Use the same
+C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
+C valgrind!)
+C * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+C INPUT PARAMETERS
+define(`ap', r0)
+define(`bp', r1)
+define(`n', r2)
+
+C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
+C up with 8 16-bit counters.  Therefore, we can sum up to 8*(2^16-1) bits, or
+C 8*(2^16-1)/32 = about 0x3fff limbs.  We use a chunksize close to that, but
+C one which can be represented as an 8-bit ARM constant.
+C
+define(`chunksize',0x3f80)
+
+ASM_START()
+PROLOGUE(mpn_hamdist)
+
+ cmp n, #chunksize
+ bhi L(gt16k)
+
+L(lt16k):
+ vmov.i64 q8, #0 C clear summation register
+ vmov.i64 q9, #0 C clear summation register
+
+ tst n, #1
+ beq L(xxx0)
+ vmov.i64 d0, #0
+ vmov.i64 d20, #0
+ sub n, n, #1
+ vld1.32 {d0[0]}, [ap]! C load 1 limb
+ vld1.32 {d20[0]}, [bp]! C load 1 limb
+ veor d0, d0, d20
+ vcnt.8 d24, d0
+ vpadal.u8 d16, d24 C d16/q8 = 0; could just splat
+
+L(xxx0):tst n, #2
+ beq L(xx00)
+ sub n, n, #2
+ vld1.32 {d0}, [ap]! C load 2 limbs
+ vld1.32 {d20}, [bp]! C load 2 limbs
+ veor d0, d0, d20
+ vcnt.8 d24, d0
+ vpadal.u8 d16, d24
+
+L(xx00):tst n, #4
+ beq L(x000)
+ sub n, n, #4
+ vld1.32 {q0}, [ap]! C load 4 limbs
+ vld1.32 {q10}, [bp]! C load 4 limbs
+ veor q0, q0, q10
+ vcnt.8 q12, q0
+ vpadal.u8 q8, q12
+
+L(x000):tst n, #8
+ beq L(0000)
+
+ subs n, n, #8
+ vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ vld1.32 {q10,q11}, [bp]! C load 8 limbs
+ bls L(sum)
+
+L(gt8): vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ vld1.32 {q14,q15}, [bp]! C load 8 limbs
+ veor q0, q0, q10
+ veor q1, q1, q11
+ sub n, n, #8
+ vcnt.8 q12, q0
+ vcnt.8 q13, q1
+ b L(mid)
+
+L(0000):subs n, n, #16
+ blo L(e0)
+
+ vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ vld1.32 {q14,q15}, [bp]! C load 8 limbs
+ vld1.32 {q10,q11}, [bp]! C load 8 limbs
+ veor q2, q2, q14
+ veor q3, q3, q15
+ vcnt.8 q12, q2
+ vcnt.8 q13, q3
+ subs n, n, #16
+ blo L(end)
+
+L(top): vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ vld1.32 {q14,q15}, [bp]! C load 8 limbs
+ veor q0, q0, q10
+ veor q1, q1, q11
+ vpadal.u8 q8, q12
+ vcnt.8 q12, q0
+ vpadal.u8 q9, q13
+ vcnt.8 q13, q1
+L(mid): vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ vld1.32 {q10,q11}, [bp]! C load 8 limbs
+ veor q2, q2, q14
+ veor q3, q3, q15
+ subs n, n, #16
+ vpadal.u8 q8, q12
+ vcnt.8 q12, q2
+ vpadal.u8 q9, q13
+ vcnt.8 q13, q3
+ bhs L(top)
+
+L(end): vpadal.u8 q8, q12
+ vpadal.u8 q9, q13
+L(sum): veor q0, q0, q10
+ veor q1, q1, q11
+ vcnt.8 q12, q0
+ vcnt.8 q13, q1
+ vpadal.u8 q8, q12
+ vpadal.u8 q9, q13
+ vadd.i16 q8, q8, q9
+ C we have 8 16-bit counts
+L(e0): vpaddl.u16 q8, q8 C we have 4 32-bit counts
+ vpaddl.u32 q8, q8 C we have 2 64-bit counts
+ vmov.32 r0, d16[0]
+ vmov.32 r1, d17[0]
+ add r0, r0, r1
+ bx lr
+
+C Code for large count. Splits operand and calls above code.
+define(`ap2', r5)
+define(`bp2', r6)
+L(gt16k):
+ push {r4,r5,r6,r14}
+ mov ap2, ap
+ mov bp2, bp
+ mov r3, n C full count
+ mov r4, #0 C total sum
+
+1: mov n, #chunksize C count for this invocation
+ bl L(lt16k) C could jump deep inside code
+ add ap2, ap2, #chunksize*4 C point at next chunk
+ add bp2, bp2, #chunksize*4 C point at next chunk
+ add r4, r4, r0
+ mov ap, ap2 C put chunk pointer in place for call
+ mov bp, bp2 C put chunk pointer in place for call
+ sub r3, r3, #chunksize
+ cmp r3, #chunksize
+ bhi 1b
+
+ mov n, r3 C count for final invocation
+ bl L(lt16k)
+ add r0, r4, r0
+ pop {r4,r5,r6,pc}
+EPILOGUE()
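
The routine above computes the Hamming distance of two n-limb operands: it
XORs corresponding limbs, counts set bits per byte with vcnt.8, and
accumulates into 16-bit lanes, handling at most chunksize (0x3f80) limbs per
inner call so the lanes cannot overflow.  As a functional reference only (not
GMP code; it assumes 32-bit limbs and uses a hypothetical ref_hamdist name),
the value being computed is:

#include <stdint.h>
#include <stddef.h>

typedef uint32_t mp_limb_t;              /* 32-bit limbs, as on 32-bit ARM */

/* Bit-slicing popcount of one limb; the asm instead uses vcnt.8 per byte. */
static unsigned popcount32(mp_limb_t x)
{
    x = x - ((x >> 1) & 0x55555555u);
    x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);
    x = (x + (x >> 4)) & 0x0f0f0f0fu;
    return (x * 0x01010101u) >> 24;
}

/* Hypothetical reference: popcount of {ap,n} XOR {bp,n}. */
uint64_t ref_hamdist(const mp_limb_t *ap, const mp_limb_t *bp, size_t n)
{
    uint64_t sum = 0;
    for (size_t i = 0; i < n; i++)
        sum += popcount32(ap[i] ^ bp[i]);
    return sum;
}
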
diff --git a/gmp-6.3.0/mpn/arm/neon/lorrshift.asm b/gmp-6.3.0/mpn/arm/neon/lorrshift.asm
new file mode 100644
index 0000000..7ebc780
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/neon/lorrshift.asm
@@ -0,0 +1,279 @@
+dnl ARM Neon mpn_lshift and mpn_rshift.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C              cycles/limb    cycles/limb    cycles/limb    good
+C               aligned        unaligned      best seen     for cpu?
+C StrongARM         -              -
+C XScale            -              -
+C Cortex-A7         ?              ?
+C Cortex-A8         ?              ?
+C Cortex-A9         3              3                          Y
+C Cortex-A15        1.5            1.5                        Y
+
+
+C We read 64 bits at a time at 32-bit aligned addresses, and except for the
+C first and last store, we write using 64-bit aligned addresses. All shifting
+C is done on 64-bit words in 'extension' registers.
+C
+C It should be possible to read also using 64-bit alignment, by manipulating
+C the shift count for unaligned operands. Not done, since it does not seem to
+C matter for A9 or A15.
+C
+C This will not work in big-endian mode.
+
+C TODO
+C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts,
+C which might make it tricky.
+C * Clean up and simplify.
+C * Consider sharing most of the code for lshift and rshift, since the feed-in
+C code, the loop, and most of the wind-down code are identical.
+C * Replace the basecase code with code using 'extension' registers.
+C * Optimise. It is not clear that this loop insn permutation is optimal for
+C either A9 or A15.
+
+C INPUT PARAMETERS
+define(`rp', `r0')
+define(`ap', `r1')
+define(`n', `r2')
+define(`cnt', `r3')
+
+ifdef(`OPERATION_lshift',`
+ define(`IFLSH', `$1')
+ define(`IFRSH', `')
+ define(`X',`0')
+ define(`Y',`1')
+ define(`func',`mpn_lshift')
+')
+ifdef(`OPERATION_rshift',`
+ define(`IFLSH', `')
+ define(`IFRSH', `$1')
+ define(`X',`1')
+ define(`Y',`0')
+ define(`func',`mpn_rshift')
+')
+
+MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
+
+ASM_START(neon)
+ TEXT
+ ALIGN(64)
+PROLOGUE(func)
+IFLSH(` mov r12, n, lsl #2 ')
+IFLSH(` add rp, rp, r12 ')
+IFLSH(` add ap, ap, r12 ')
+
+ cmp n, #4 C SIMD code n limit
+ ble L(base)
+
+ifdef(`OPERATION_lshift',`
+ vdup.32 d6, r3 C left shift count is positive
+ sub r3, r3, #64 C right shift count is negative
+ vdup.32 d7, r3
+ mov r12, #-8') C lshift pointer update offset
+ifdef(`OPERATION_rshift',`
+ rsb r3, r3, #0 C right shift count is negative
+ vdup.32 d6, r3
+ add r3, r3, #64 C left shift count is positive
+ vdup.32 d7, r3
+ mov r12, #8') C rshift pointer update offset
+
+IFLSH(` sub ap, ap, #8 ')
+ vld1.32 {d19}, [ap], r12 C load initial 2 limbs
+ vshl.u64 d18, d19, d7 C retval
+
+ tst rp, #4 C is rp 64-bit aligned already?
+ beq L(rp_aligned) C yes, skip
+IFLSH(` add ap, ap, #4 ') C move back ap pointer
+IFRSH(` sub ap, ap, #4 ') C move back ap pointer
+ vshl.u64 d4, d19, d6
+ sub n, n, #1 C first limb handled
+IFLSH(` sub rp, rp, #4 ')
+ vst1.32 {d4[Y]}, [rp]IFRSH(!) C store first limb, rp gets aligned
+ vld1.32 {d19}, [ap], r12 C load ap[1] and ap[2]
+
+L(rp_aligned):
+IFLSH(` sub rp, rp, #8 ')
+ subs n, n, #6
+ blt L(two_or_three_more)
+ tst n, #2
+ beq L(2)
+
+L(1): vld1.32 {d17}, [ap], r12
+ vshl.u64 d5, d19, d6
+ vld1.32 {d16}, [ap], r12
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ sub n, n, #2
+ b L(mid)
+
+L(2): vld1.32 {d16}, [ap], r12
+ vshl.u64 d4, d19, d6
+ vld1.32 {d17}, [ap], r12
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ subs n, n, #4
+ blt L(end)
+
+L(top): vld1.32 {d16}, [ap], r12
+ vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vst1.32 {d2}, [rp:64], r12
+L(mid): vld1.32 {d17}, [ap], r12
+ vorr d3, d5, d0
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vst1.32 {d3}, [rp:64], r12
+ subs n, n, #4
+ bge L(top)
+
+L(end): tst n, #1
+ beq L(evn)
+
+ vorr d2, d4, d1
+ vst1.32 {d2}, [rp:64], r12
+ b L(cj1)
+
+L(evn): vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vshl.u64 d16, d17, d6
+ vst1.32 {d2}, [rp:64], r12
+ vorr d2, d5, d0
+ b L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+ tst n, #1
+ beq L(l2)
+
+L(l3): vshl.u64 d5, d19, d6
+ vld1.32 {d17}, [ap], r12
+L(cj1): veor d16, d16, d16
+IFLSH(` add ap, ap, #4 ')
+ vld1.32 {d16[Y]}, [ap], r12
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vorr d3, d5, d0
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vst1.32 {d3}, [rp:64], r12
+ vorr d2, d4, d1
+ vst1.32 {d2}, [rp:64], r12
+IFLSH(` add rp, rp, #4 ')
+ vst1.32 {d5[Y]}, [rp]
+ vmov.32 r0, d18[X]
+ bx lr
+
+L(l2): vld1.32 {d16}, [ap], r12
+ vshl.u64 d4, d19, d6
+ vshl.u64 d1, d16, d7
+ vshl.u64 d16, d16, d6
+ vorr d2, d4, d1
+L(cj2): vst1.32 {d2}, [rp:64], r12
+ vst1.32 {d16}, [rp]
+ vmov.32 r0, d18[X]
+ bx lr
+
+
+define(`tnc', `r12')
+L(base):
+ push {r4, r6, r7, r8}
+ifdef(`OPERATION_lshift',`
+ ldr r4, [ap, #-4]!
+ rsb tnc, cnt, #32
+
+ mov r7, r4, lsl cnt
+ tst n, #1
+ beq L(ev) C n even
+
+L(od): subs n, n, #2
+ bcc L(ed1) C n = 1
+ ldr r8, [ap, #-4]!
+ b L(md) C n = 3
+
+L(ev): ldr r6, [ap, #-4]!
+ subs n, n, #2
+	beq	L(ed)			C n = 2
+ C n = 4
+L(tp): ldr r8, [ap, #-4]!
+ orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]!
+ mov r7, r6, lsl cnt
+L(md): ldr r6, [ap, #-4]!
+ orr r7, r7, r8, lsr tnc
+ str r7, [rp, #-4]!
+ mov r7, r8, lsl cnt
+
+L(ed): orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]!
+ mov r7, r6, lsl cnt
+L(ed1): str r7, [rp, #-4]
+ mov r0, r4, lsr tnc
+')
+ifdef(`OPERATION_rshift',`
+ ldr r4, [ap]
+ rsb tnc, cnt, #32
+
+ mov r7, r4, lsr cnt
+ tst n, #1
+ beq L(ev) C n even
+
+L(od): subs n, n, #2
+ bcc L(ed1) C n = 1
+ ldr r8, [ap, #4]!
+ b L(md) C n = 3
+
+L(ev): ldr r6, [ap, #4]!
+ subs n, n, #2
+ beq L(ed) C n = 2
+ C n = 4
+
+L(tp): ldr r8, [ap, #4]!
+ orr r7, r7, r6, lsl tnc
+ str r7, [rp], #4
+ mov r7, r6, lsr cnt
+L(md): ldr r6, [ap, #4]!
+ orr r7, r7, r8, lsl tnc
+ str r7, [rp], #4
+ mov r7, r8, lsr cnt
+
+L(ed): orr r7, r7, r6, lsl tnc
+ str r7, [rp], #4
+ mov r7, r6, lsr cnt
+L(ed1): str r7, [rp], #4
+ mov r0, r4, lsl tnc
+')
+ pop {r4, r6, r7, r8}
+ bx r14
+EPILOGUE()
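
Both entry points generated from this file follow the usual GMP shift
contract: shift an n-limb operand by cnt bits (1 <= cnt <= 31 with 32-bit
limbs), store the n result limbs at rp, and return the bits shifted out at
the far end.  A rough C sketch of that contract, assuming 32-bit limbs and
using hypothetical ref_* names (this is not the GMP implementation), is:

#include <stdint.h>
#include <stddef.h>

typedef uint32_t mp_limb_t;

/* Shift {ap,n} left by cnt, store at rp, return the bits shifted out at
   the top.  Runs from the most significant limb down, like the asm. */
mp_limb_t ref_lshift(mp_limb_t *rp, const mp_limb_t *ap, size_t n, unsigned cnt)
{
    unsigned tnc = 32 - cnt;
    mp_limb_t retval = ap[n - 1] >> tnc;
    for (size_t i = n - 1; i > 0; i--)
        rp[i] = (ap[i] << cnt) | (ap[i - 1] >> tnc);
    rp[0] = ap[0] << cnt;
    return retval;
}

/* Shift {ap,n} right by cnt, store at rp, return the bits shifted out at
   the bottom (placed in the high part of the return limb). */
mp_limb_t ref_rshift(mp_limb_t *rp, const mp_limb_t *ap, size_t n, unsigned cnt)
{
    unsigned tnc = 32 - cnt;
    mp_limb_t retval = ap[0] << tnc;
    for (size_t i = 0; i + 1 < n; i++)
        rp[i] = (ap[i] >> cnt) | (ap[i + 1] << tnc);
    rp[n - 1] = ap[n - 1] >> cnt;
    return retval;
}
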
diff --git a/gmp-6.3.0/mpn/arm/neon/lshiftc.asm b/gmp-6.3.0/mpn/arm/neon/lshiftc.asm
new file mode 100644
index 0000000..f1bf0de
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/neon/lshiftc.asm
@@ -0,0 +1,242 @@
+dnl ARM Neon mpn_lshiftc.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C              cycles/limb    cycles/limb    cycles/limb    good
+C               aligned        unaligned      best seen     for cpu?
+C StrongARM         -              -
+C XScale            -              -
+C Cortex-A7         ?              ?
+C Cortex-A8         ?              ?
+C Cortex-A9         3.5            3.5                        Y
+C Cortex-A15        1.75           1.75                       Y
+
+
+C We read 64 bits at a time at 32-bit aligned addresses, and except for the
+C first and last store, we write using 64-bit aligned addresses. All shifting
+C is done on 64-bit words in 'extension' registers.
+C
+C It should be possible to read also using 64-bit alignment, by manipulating
+C the shift count for unaligned operands. Not done, since it does not seem to
+C matter for A9 or A15.
+C
+C This will not work in big-endian mode.
+
+C TODO
+C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts,
+C which might make it tricky.
+C * Clean up and simplify.
+C * Consider sharing most of the code for lshift and rshift, since the feed-in
+C code, the loop, and most of the wind-down code are identical.
+C * Replace the basecase code with code using 'extension' registers.
+C * Optimise. It is not clear that this loop insn permutation is optimal for
+C either A9 or A15.
+
+C INPUT PARAMETERS
+define(`rp', `r0')
+define(`ap', `r1')
+define(`n', `r2')
+define(`cnt', `r3')
+
+ASM_START(neon)
+ TEXT
+ ALIGN(64)
+PROLOGUE(mpn_lshiftc)
+ mov r12, n, lsl #2
+ add rp, rp, r12
+ add ap, ap, r12
+
+ cmp n, #4 C SIMD code n limit
+ ble L(base)
+
+ vdup.32 d6, r3 C left shift count is positive
+ sub r3, r3, #64 C right shift count is negative
+ vdup.32 d7, r3
+ mov r12, #-8 C lshift pointer update offset
+
+ sub ap, ap, #8
+ vld1.32 {d19}, [ap], r12 C load initial 2 limbs
+ vshl.u64 d18, d19, d7 C retval
+
+ tst rp, #4 C is rp 64-bit aligned already?
+ beq L(rp_aligned) C yes, skip
+ vmvn d19, d19
+ add ap, ap, #4 C move back ap pointer
+ vshl.u64 d4, d19, d6
+ sub n, n, #1 C first limb handled
+ sub rp, rp, #4
+ vst1.32 {d4[1]}, [rp] C store first limb, rp gets aligned
+ vld1.32 {d19}, [ap], r12 C load ap[1] and ap[2]
+
+L(rp_aligned):
+ sub rp, rp, #8
+ subs n, n, #6
+ vmvn d19, d19
+ blt L(two_or_three_more)
+ tst n, #2
+ beq L(2)
+
+L(1): vld1.32 {d17}, [ap], r12
+ vshl.u64 d5, d19, d6
+ vmvn d17, d17
+ vld1.32 {d16}, [ap], r12
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ sub n, n, #2
+ b L(mid)
+
+L(2): vld1.32 {d16}, [ap], r12
+ vshl.u64 d4, d19, d6
+ vmvn d16, d16
+ vld1.32 {d17}, [ap], r12
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ subs n, n, #4
+ blt L(end)
+
+L(top): vmvn d17, d17
+ vld1.32 {d16}, [ap], r12
+ vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vst1.32 {d2}, [rp:64], r12
+L(mid): vmvn d16, d16
+ vld1.32 {d17}, [ap], r12
+ vorr d3, d5, d0
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vst1.32 {d3}, [rp:64], r12
+ subs n, n, #4
+ bge L(top)
+
+L(end): tst n, #1
+ beq L(evn)
+
+ vorr d2, d4, d1
+ vst1.32 {d2}, [rp:64], r12
+ b L(cj1)
+
+L(evn): vmvn d17, d17
+ vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vst1.32 {d2}, [rp:64], r12
+ vmov.u8 d17, #255
+ vorr d2, d5, d0
+ vshl.u64 d0, d17, d7
+ vorr d3, d4, d0
+ b L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+ tst n, #1
+ beq L(l2)
+
+L(l3): vshl.u64 d5, d19, d6
+ vld1.32 {d17}, [ap], r12
+L(cj1): vmov.u8 d16, #0
+ add ap, ap, #4
+ vmvn d17, d17
+ vld1.32 {d16[1]}, [ap], r12
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vmvn d16, d16
+ vorr d3, d5, d0
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vst1.32 {d3}, [rp:64], r12
+ vorr d2, d4, d1
+ vst1.32 {d2}, [rp:64], r12
+ add rp, rp, #4
+ vst1.32 {d5[1]}, [rp]
+ vmov.32 r0, d18[0]
+ bx lr
+
+L(l2): vld1.32 {d16}, [ap], r12
+ vshl.u64 d4, d19, d6
+ vmvn d16, d16
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vmov.u8 d17, #255
+ vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vorr d3, d5, d0
+L(cj2): vst1.32 {d2}, [rp:64], r12
+ vst1.32 {d3}, [rp]
+ vmov.32 r0, d18[0]
+ bx lr
+
+
+define(`tnc', `r12')
+L(base):
+ push {r4, r6, r7, r8}
+ ldr r4, [ap, #-4]!
+ rsb tnc, cnt, #32
+ mvn r6, r4
+
+ mov r7, r6, lsl cnt
+ tst n, #1
+ beq L(ev) C n even
+
+L(od): subs n, n, #2
+ bcc L(ed1) C n = 1
+ ldr r8, [ap, #-4]!
+ mvn r8, r8
+ b L(md) C n = 3
+
+L(ev): ldr r6, [ap, #-4]!
+ mvn r6, r6
+ subs n, n, #2
+	beq	L(ed)			C n = 2
+ C n = 4
+L(tp): ldr r8, [ap, #-4]!
+ orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]!
+ mvn r8, r8
+ mov r7, r6, lsl cnt
+L(md): ldr r6, [ap, #-4]!
+ orr r7, r7, r8, lsr tnc
+ str r7, [rp, #-4]!
+ mvn r6, r6
+ mov r7, r8, lsl cnt
+
+L(ed): orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]!
+ mov r7, r6, lsl cnt
+L(ed1): mvn r6, #0
+ orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]
+ mov r0, r4, lsr tnc
+ pop {r4, r6, r7, r8}
+ bx r14
+EPILOGUE()
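
mpn_lshiftc is the shift-and-complement variant: the stored limbs are the
one's complement of the left-shifted operand (so the vacated low bits come
out as ones), while the return value is the uncomplemented bits shifted out
at the top, exactly as for mpn_lshift.  A C sketch under the same 32-bit-limb
assumption, with a hypothetical ref_lshiftc name:

#include <stdint.h>
#include <stddef.h>

typedef uint32_t mp_limb_t;

mp_limb_t ref_lshiftc(mp_limb_t *rp, const mp_limb_t *ap, size_t n, unsigned cnt)
{
    unsigned tnc = 32 - cnt;
    mp_limb_t retval = ap[n - 1] >> tnc;   /* shifted-out bits, not complemented */
    for (size_t i = n - 1; i > 0; i--)
        rp[i] = ~((ap[i] << cnt) | (ap[i - 1] >> tnc));
    rp[0] = ~(ap[0] << cnt);               /* vacated low bits become ones */
    return retval;
}
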
diff --git a/gmp-6.3.0/mpn/arm/neon/popcount.asm b/gmp-6.3.0/mpn/arm/neon/popcount.asm
new file mode 100644
index 0000000..2f8f9af
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/neon/popcount.asm
@@ -0,0 +1,166 @@
+dnl ARM Neon mpn_popcount -- mpn bit population count.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM: -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.125
+C Cortex-A15 0.56
+
+C TODO
+C  * Explore using vldr and vldm.  Does it help on A9?  (These loads move
+C    64 bits at a time, which would break in big-endian mode.  Byte order
+C    does not matter for popcount itself, though it might for the edge loads.)
+C * Arrange to align the pointer, if that helps performance. Use the same
+C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
+C valgrind!)
+C * Explore if explicit align directives, e.g., "[ptr:128]" help.
+C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+C INPUT PARAMETERS
+define(`ap', r0)
+define(`n', r1)
+
+C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
+C up with 8 16-bit counters.  Therefore, we can sum up to 8*(2^16-1) bits, or
+C 8*(2^16-1)/32 = about 0x3fff limbs.  We use a chunksize close to that, but
+C one which can be represented as an 8-bit ARM constant.
+C
+define(`chunksize',0x3f80)
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+
+ cmp n, #chunksize
+ bhi L(gt16k)
+
+L(lt16k):
+ vmov.i64 q8, #0 C clear summation register
+ vmov.i64 q9, #0 C clear summation register
+
+ tst n, #1
+ beq L(xxx0)
+ vmov.i64 d0, #0
+ sub n, n, #1
+ vld1.32 {d0[0]}, [ap]! C load 1 limb
+ vcnt.8 d24, d0
+ vpadal.u8 d16, d24 C d16/q8 = 0; could just splat
+
+L(xxx0):tst n, #2
+ beq L(xx00)
+ sub n, n, #2
+ vld1.32 {d0}, [ap]! C load 2 limbs
+ vcnt.8 d24, d0
+ vpadal.u8 d16, d24
+
+L(xx00):tst n, #4
+ beq L(x000)
+ sub n, n, #4
+ vld1.32 {q0}, [ap]! C load 4 limbs
+ vcnt.8 q12, q0
+ vpadal.u8 q8, q12
+
+L(x000):tst n, #8
+ beq L(0000)
+
+ subs n, n, #8
+ vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ bls L(sum)
+
+L(gt8): vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ sub n, n, #8
+ vcnt.8 q12, q0
+ vcnt.8 q13, q1
+ b L(mid)
+
+L(0000):subs n, n, #16
+ blo L(e0)
+
+ vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ vcnt.8 q12, q2
+ vcnt.8 q13, q3
+ subs n, n, #16
+ blo L(end)
+
+L(top): vld1.32 {q2,q3}, [ap]! C load 8 limbs
+ vpadal.u8 q8, q12
+ vcnt.8 q12, q0
+ vpadal.u8 q9, q13
+ vcnt.8 q13, q1
+L(mid): vld1.32 {q0,q1}, [ap]! C load 8 limbs
+ subs n, n, #16
+ vpadal.u8 q8, q12
+ vcnt.8 q12, q2
+ vpadal.u8 q9, q13
+ vcnt.8 q13, q3
+ bhs L(top)
+
+L(end): vpadal.u8 q8, q12
+ vpadal.u8 q9, q13
+L(sum): vcnt.8 q12, q0
+ vcnt.8 q13, q1
+ vpadal.u8 q8, q12
+ vpadal.u8 q9, q13
+ vadd.i16 q8, q8, q9
+ C we have 8 16-bit counts
+L(e0): vpaddl.u16 q8, q8 C we have 4 32-bit counts
+ vpaddl.u32 q8, q8 C we have 2 64-bit counts
+ vmov.32 r0, d16[0]
+ vmov.32 r1, d17[0]
+ add r0, r0, r1
+ bx lr
+
+C Code for large count. Splits operand and calls above code.
+define(`ap2', r2) C caller-saves reg not used above
+L(gt16k):
+ push {r4,r14}
+ mov ap2, ap
+ mov r3, n C full count
+ mov r4, #0 C total sum
+
+1: mov n, #chunksize C count for this invocation
+ bl L(lt16k) C could jump deep inside code
+ add ap2, ap2, #chunksize*4 C point at next chunk
+ add r4, r4, r0
+ mov ap, ap2 C put chunk pointer in place for call
+ sub r3, r3, #chunksize
+ cmp r3, #chunksize
+ bhi 1b
+
+ mov n, r3 C count for final invocation
+ bl L(lt16k)
+ add r0, r4, r0
+ pop {r4,pc}
+EPILOGUE()
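
The L(gt16k) path above exists only because the 16-bit lanes could overflow
on very long operands: it walks the operand in chunks of chunksize limbs and
adds up the per-chunk results.  A plain C model of that structure (a sketch
only, not GMP code, assuming 32-bit limbs and hypothetical names):

#include <stdint.h>
#include <stddef.h>

typedef uint32_t mp_limb_t;

#define CHUNKSIZE 0x3f80                 /* limbs per inner call, as in the asm */

static uint64_t popcount_chunk(const mp_limb_t *ap, size_t n)
{
    uint64_t sum = 0;
    for (size_t i = 0; i < n; i++) {
        mp_limb_t x = ap[i];             /* bit-slicing popcount per limb */
        x = x - ((x >> 1) & 0x55555555u);
        x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);
        x = (x + (x >> 4)) & 0x0f0f0f0fu;
        sum += (x * 0x01010101u) >> 24;
    }
    return sum;
}

uint64_t ref_popcount(const mp_limb_t *ap, size_t n)
{
    uint64_t total = 0;
    while (n > CHUNKSIZE) {              /* mirrors the outer 1: loop */
        total += popcount_chunk(ap, CHUNKSIZE);
        ap += CHUNKSIZE;
        n -= CHUNKSIZE;
    }
    return total + popcount_chunk(ap, n);
}
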
diff --git a/gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm b/gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm
new file mode 100644
index 0000000..69fceb0
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm
@@ -0,0 +1,140 @@
+dnl ARM Neon mpn_sec_tabselect.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.15
+C Cortex-A15 0.65
+
+define(`rp', `r0')
+define(`tp', `r1')
+define(`n', `r2')
+define(`nents', `r3')
+C define(`which', on stack)
+
+define(`i', `r4')
+define(`j', `r5')
+
+define(`maskq', `q10')
+define(`maskd', `d20')
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+ push {r4-r5}
+
+ add r4, sp, #8
+ vld1.32 {d30[], d31[]}, [r4] C 4 `which' copies
+ vmov.i32 q14, #1 C 4 copies of 1
+
+ subs j, n, #8
+ bmi L(outer_end)
+
+L(outer_top):
+ mov i, nents
+ mov r12, tp C preserve tp
+ veor q13, q13, q13 C 4 counter copies
+ veor q2, q2, q2
+ veor q3, q3, q3
+ ALIGN(16)
+L(top): vceq.i32 maskq, q13, q15 C compare idx copies to `which' copies
+ vld1.32 {q0,q1}, [tp]
+ vadd.i32 q13, q13, q14
+ vbit q2, q0, maskq
+ vbit q3, q1, maskq
+ add tp, tp, n, lsl #2
+ subs i, i, #1
+ bne L(top)
+ vst1.32 {q2,q3}, [rp]!
+ add tp, r12, #32 C restore tp, point to next slice
+ subs j, j, #8
+ bpl L(outer_top)
+L(outer_end):
+
+ tst n, #4
+ beq L(b0xx)
+L(b1xx):mov i, nents
+ mov r12, tp
+ veor q13, q13, q13
+ veor q2, q2, q2
+ ALIGN(16)
+L(tp4): vceq.i32 maskq, q13, q15
+ vld1.32 {q0}, [tp]
+ vadd.i32 q13, q13, q14
+ vbit q2, q0, maskq
+ add tp, tp, n, lsl #2
+ subs i, i, #1
+ bne L(tp4)
+ vst1.32 {q2}, [rp]!
+ add tp, r12, #16
+
+L(b0xx):tst n, #2
+ beq L(b00x)
+L(b01x):mov i, nents
+ mov r12, tp
+ veor d26, d26, d26
+ veor d4, d4, d4
+ ALIGN(16)
+L(tp2): vceq.i32 maskd, d26, d30
+ vld1.32 {d0}, [tp]
+ vadd.i32 d26, d26, d28
+ vbit d4, d0, maskd
+ add tp, tp, n, lsl #2
+ subs i, i, #1
+ bne L(tp2)
+ vst1.32 {d4}, [rp]!
+ add tp, r12, #8
+
+L(b00x):tst n, #1
+ beq L(b000)
+L(b001):mov i, nents
+ mov r12, tp
+ veor d26, d26, d26
+ veor d4, d4, d4
+ ALIGN(16)
+L(tp1): vceq.i32 maskd, d26, d30
+ vld1.32 {d0[0]}, [tp]
+ vadd.i32 d26, d26, d28
+ vbit d4, d0, maskd
+ add tp, tp, n, lsl #2
+ subs i, i, #1
+ bne L(tp1)
+ vst1.32 {d4[0]}, [rp]
+
+L(b000):pop {r4-r5}
+ bx r14
+EPILOGUE()
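
mpn_sec_tabselect copies entry number `which' out of a table of nents
entries of n limbs each, reading every entry so the memory access pattern is
independent of the selected index.  The vceq/vbit pairs above build an
all-ones or all-zeros mask per entry and merge it into cleared accumulators.
A C sketch of the same idea, assuming 32-bit limbs (ref_sec_tabselect is a
hypothetical name, and a hardened version would need care that the compiler
does not turn the mask back into a branch):

#include <stdint.h>
#include <stddef.h>

typedef uint32_t mp_limb_t;

void ref_sec_tabselect(mp_limb_t *rp, const mp_limb_t *tp,
                       size_t n, size_t nents, size_t which)
{
    for (size_t i = 0; i < n; i++)       /* like the veor-cleared accumulators */
        rp[i] = 0;
    for (size_t k = 0; k < nents; k++) {
        /* all-ones when k == which, all-zeros otherwise */
        mp_limb_t mask = (mp_limb_t)0 - (mp_limb_t)(k == which);
        for (size_t i = 0; i < n; i++)
            rp[i] |= tp[k * n + i] & mask;   /* like vbit under the mask */
    }
}
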