path: root/gmp-6.3.0/mpn/arm/v7a/cora15/neon
author    Duncan Wilkie <antigravityd@gmail.com>  2023-11-18 06:11:09 -0600
committer Duncan Wilkie <antigravityd@gmail.com>  2023-11-18 06:11:09 -0600
commit    11da511c784eca003deb90c23570f0873954e0de (patch)
tree      e14fdd3d5d6345956d67e79ae771d0633d28362b /gmp-6.3.0/mpn/arm/v7a/cora15/neon
Initial commit.
Diffstat (limited to 'gmp-6.3.0/mpn/arm/v7a/cora15/neon')
-rw-r--r--  gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm   43
-rw-r--r--  gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm   43
-rw-r--r--  gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm  144
-rw-r--r--  gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm             97
-rw-r--r--  gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm          110
-rw-r--r--  gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm           90
-rw-r--r--  gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm     177
7 files changed, 704 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
new file mode 100644
index 0000000..d8cfe3f
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm
@@ -0,0 +1,43 @@
+dnl ARM mpn_addlsh1_n, mpn_sublsh1_n, mpn_rsblsh1_n
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 1)
+
+ifdef(`OPERATION_addlsh1_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n)
+
+include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm')
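The wrapper above just fixes LSH = 1 and pulls in the shared body from
aorsorrlshC_n.asm; m4 then emits one of the three entry points depending on
which OPERATION_* symbol is defined.  As a reference for what the generated
addlsh entry point computes -- rp[] = up[] + (vp[] << C), returning the carry
out, per the documented mpn semantics -- here is a minimal C sketch on 32-bit
limbs (the helper name ref_addlshC_n is hypothetical, not GMP code):

    #include <stdint.h>

    typedef uint32_t limb;   /* 32-bit limbs, as in this ARM code */

    /* Illustrative only: rp[] = up[] + (vp[] << lsh), 0 < lsh < 32,
       returning the carry out.  */
    limb ref_addlshC_n (limb *rp, const limb *up, const limb *vp,
                        long n, unsigned lsh)
    {
      limb cy = 0;        /* carry of the addition, 0 or 1 */
      limb sh_in = 0;     /* bits shifted out of the previous limb */
      for (long i = 0; i < n; i++)
        {
          limb v = vp[i];
          limb shifted = (v << lsh) | sh_in;   /* shifted limb, carry in */
          sh_in = v >> (32 - lsh);             /* bits for the next limb */
          uint64_t s = (uint64_t) up[i] + shifted + cy;
          rp[i] = (limb) s;
          cy = (limb) (s >> 32);
        }
      return sh_in + cy;  /* leftover shift bits plus the add carry */
    }

The sub and rsb variants differ only in using borrows (sbcs) and in operand
order, as the ADCSBCS/RETVAL macro definitions in the shared body show.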
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
new file mode 100644
index 0000000..b48204d
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm
@@ -0,0 +1,43 @@
+dnl ARM mpn_addlsh2_n, mpn_sublsh2_n, mpn_rsblsh2_n
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+define(LSH, 2)
+
+ifdef(`OPERATION_addlsh2_n',`define(`DO_add')')
+ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')')
+ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')')
+
+MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n)
+
+include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm')
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
new file mode 100644
index 0000000..51f93c1
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm
@@ -0,0 +1,144 @@
+dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 5.25
+C Cortex-A15 2.25
+
+C TODO
+C * Consider using 4-way feed-in code.
+C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
+C insufficiently for A7 and A8.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n', `r3')
+
+ifdef(`DO_add', `
+ define(`ADCSBCS', `adcs $1, $2, $3')
+ define(`CLRCY', `cmn r13, #1')
+ define(`RETVAL', `adc r0, $1, #0')
+ define(`func', mpn_addlsh`'LSH`'_n)')
+ifdef(`DO_sub', `
+ define(`ADCSBCS', `sbcs $1, $2, $3')
+ define(`CLRCY', `cmp r13, #0')
+ define(`RETVAL', `sbc $2, $2, $2
+ cmn $2, #1
+ adc r0, $1, #0')
+ define(`func', mpn_sublsh`'LSH`'_n)')
+ifdef(`DO_rsb', `
+ define(`ADCSBCS', `sbcs $1, $3, $2')
+ define(`CLRCY', `cmp r13, #0')
+ define(`RETVAL', `sbc r0, $1, #0')
+ define(`func', mpn_rsblsh`'LSH`'_n)')
+
+
+ASM_START()
+PROLOGUE(func)
+ push {r4-r10}
+ vmov.i8 d0, #0 C could feed carry through here
+ CLRCY
+ tst n, #1
+ beq L(bb0)
+
+L(bb1): vld1.32 {d3[0]}, [vp]!
+ vsli.u32 d0, d3, #LSH
+ ldr r12, [up], #4
+ vmov.32 r5, d0[0]
+ vshr.u32 d0, d3, #32-LSH
+ ADCSBCS( r12, r12, r5)
+ str r12, [rp], #4
+ bics n, n, #1
+ beq L(rtn)
+
+L(bb0): tst n, #2
+ beq L(b00)
+
+L(b10): vld1.32 {d3}, [vp]!
+ vsli.u64 d0, d3, #LSH
+ ldmia up!, {r10,r12}
+ vmov r4, r5, d0
+ vshr.u64 d0, d3, #64-LSH
+ ADCSBCS( r10, r10, r4)
+ ADCSBCS( r12, r12, r5)
+ stmia rp!, {r10,r12}
+ bics n, n, #2
+ beq L(rtn)
+
+L(b00): vld1.32 {d2}, [vp]!
+ vsli.u64 d0, d2, #LSH
+ vshr.u64 d1, d2, #64-LSH
+ vld1.32 {d3}, [vp]!
+ vsli.u64 d1, d3, #LSH
+ vmov r6, r7, d0
+ vshr.u64 d0, d3, #64-LSH
+ sub n, n, #4
+ tst n, n
+ beq L(end)
+
+ ALIGN(16)
+L(top): ldmia up!, {r8,r9,r10,r12}
+ vld1.32 {d2}, [vp]!
+ vsli.u64 d0, d2, #LSH
+ vmov r4, r5, d1
+ vshr.u64 d1, d2, #64-LSH
+ ADCSBCS( r8, r8, r6)
+ ADCSBCS( r9, r9, r7)
+ vld1.32 {d3}, [vp]!
+ vsli.u64 d1, d3, #LSH
+ vmov r6, r7, d0
+ vshr.u64 d0, d3, #64-LSH
+ ADCSBCS( r10, r10, r4)
+ ADCSBCS( r12, r12, r5)
+ stmia rp!, {r8,r9,r10,r12}
+ sub n, n, #4
+ tst n, n
+ bne L(top)
+
+L(end): ldmia up!, {r8,r9,r10,r12}
+ vmov r4, r5, d1
+ ADCSBCS( r8, r8, r6)
+ ADCSBCS( r9, r9, r7)
+ ADCSBCS( r10, r10, r4)
+ ADCSBCS( r12, r12, r5)
+ stmia rp!, {r8,r9,r10,r12}
+L(rtn): vmov.32 r0, d0[0]
+ RETVAL( r0, r1)
+ pop {r4-r10}
+ bx r14
+EPILOGUE()
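Two details of the shared body are worth spelling out.  First, d0 acts as a
shift-carry pipeline: vsli.u64 shifts each incoming 64-bit chunk left by LSH
and inserts the bits left over from the previous chunk underneath, while
vshr.u64 saves the top LSH bits for the next chunk.  Second, the ADCSBCS
carry must stay live in the CPSR across iterations, which is why the loop
counter uses sub followed by tst -- neither touches the C flag -- rather
than a flag-clobbering subs.  A C sketch of the chunk shifter (hypothetical
helper name, 0 < lsh < 64 assumed):

    #include <stdint.h>

    /* The role of d0 above: carry the bits shifted out of one 64-bit
       chunk into the next (vsli.u64 inserts, vshr.u64 extracts).  */
    uint64_t shift_chunk (uint64_t *carry, uint64_t v, unsigned lsh)
    {
      uint64_t out = (v << lsh) | *carry;
      *carry = v >> (64 - lsh);
      return out;
    }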
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm
new file mode 100644
index 0000000..9e7a629
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm
@@ -0,0 +1,97 @@
+dnl ARM Neon mpn_com optimised for A15.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A8 ?
+C Cortex-A9 2.1
+C Cortex-A15 0.65
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+ cmp n, #7
+ ble L(bc)
+
+C Perform a few initial operations until rp is 128-bit aligned
+ tst rp, #4
+ beq L(al1)
+ vld1.32 {d0[0]}, [up]!
+ sub n, n, #1
+ vmvn d0, d0
+ vst1.32 {d0[0]}, [rp]!
+L(al1): tst rp, #8
+ beq L(al2)
+ vld1.32 {d0}, [up]!
+ sub n, n, #2
+ vmvn d0, d0
+ vst1.32 {d0}, [rp:64]!
+L(al2): vld1.32 {q2}, [up]!
+ subs n, n, #12
+ blt L(end)
+
+ ALIGN(16)
+L(top): vld1.32 {q0}, [up]!
+ vmvn q2, q2
+ subs n, n, #8
+ vst1.32 {q2}, [rp:128]!
+ vld1.32 {q2}, [up]!
+ vmvn q0, q0
+ vst1.32 {q0}, [rp:128]!
+ bge L(top)
+
+L(end): vmvn q2, q2
+ vst1.32 {q2}, [rp:128]!
+
+C Handle last 0-7 limbs.  Note that rp is aligned after the loop, but not when we
+C arrive here via L(bc)
+L(bc): tst n, #4
+ beq L(tl1)
+ vld1.32 {q0}, [up]!
+ vmvn q0, q0
+ vst1.32 {q0}, [rp]!
+L(tl1): tst n, #2
+ beq L(tl2)
+ vld1.32 {d0}, [up]!
+ vmvn d0, d0
+ vst1.32 {d0}, [rp]!
+L(tl2): tst n, #1
+ beq L(tl3)
+ vld1.32 {d0[0]}, [up]
+ vmvn d0, d0
+ vst1.32 {d0[0]}, [rp]
+L(tl3): bx lr
+EPILOGUE()
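mpn_com itself is just a limb-wise complement; the interesting part of the
routine above is the alignment peeling, where one and then two limbs are
handled until rp is 16-byte aligned so the main loop can issue :128-aligned
q-register stores.  The semantics reduce to this C loop (hypothetical name,
illustrative only):

    #include <stddef.h>
    #include <stdint.h>

    typedef uint32_t limb;

    /* Illustrative: rp[i] = ~up[i] for n limbs.  */
    void ref_com (limb *rp, const limb *up, size_t n)
    {
      for (size_t i = 0; i < n; i++)
        rp[i] = ~up[i];
    }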
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm
new file mode 100644
index 0000000..98fe535
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm
@@ -0,0 +1,110 @@
+dnl ARM Neon mpn_copyd optimised for A15.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.75 slower than core register code
+C Cortex-A15 0.52
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+ add rp, rp, n, lsl #2
+ add up, up, n, lsl #2
+
+ cmp n, #7
+ ble L(bc)
+
+C Copy until rp is 128-bit aligned
+ tst rp, #4
+ beq L(al1)
+ sub up, up, #4
+ vld1.32 {d22[0]}, [up]
+ sub n, n, #1
+ sub rp, rp, #4
+ vst1.32 {d22[0]}, [rp]
+L(al1): tst rp, #8
+ beq L(al2)
+ sub up, up, #8
+ vld1.32 {d22}, [up]
+ sub n, n, #2
+ sub rp, rp, #8
+ vst1.32 {d22}, [rp:64]
+L(al2): sub up, up, #16
+ vld1.32 {d26-d27}, [up]
+ subs n, n, #12
+ sub rp, rp, #16 C offset rp for loop
+ blt L(end)
+
+ sub up, up, #16 C offset up for loop
+ mov r12, #-16
+
+ ALIGN(16)
+L(top): vld1.32 {d22-d23}, [up], r12
+ vst1.32 {d26-d27}, [rp:128], r12
+ vld1.32 {d26-d27}, [up], r12
+ vst1.32 {d22-d23}, [rp:128], r12
+ subs n, n, #8
+ bge L(top)
+
+ add up, up, #16 C undo up offset
+ C rp offset undoing folded
+L(end): vst1.32 {d26-d27}, [rp:128]
+
+C Copy last 0-7 limbs.  Note that rp is aligned after the loop, but not when we
+C arrive here via L(bc)
+L(bc): tst n, #4
+ beq L(tl1)
+ sub up, up, #16
+ vld1.32 {d22-d23}, [up]
+ sub rp, rp, #16
+ vst1.32 {d22-d23}, [rp]
+L(tl1): tst n, #2
+ beq L(tl2)
+ sub up, up, #8
+ vld1.32 {d22}, [up]
+ sub rp, rp, #8
+ vst1.32 {d22}, [rp]
+L(tl2): tst n, #1
+ beq L(tl3)
+ sub up, up, #4
+ vld1.32 {d22[0]}, [up]
+ sub rp, rp, #4
+ vst1.32 {d22[0]}, [rp]
+L(tl3): bx lr
+EPILOGUE()
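mpn_copyd copies from the highest limb downwards (note how rp and up are
first advanced past the end of the operands), which keeps it correct when
the regions overlap with rp above up; mpn_copyi in the next file is the
ascending counterpart for the opposite overlap.  A C sketch of copyd's
semantics (hypothetical name, illustrative only):

    #include <stddef.h>
    #include <stdint.h>

    typedef uint32_t limb;

    /* Illustrative: copy n limbs high-to-low; safe for overlapping
       operands when rp >= up.  */
    void ref_copyd (limb *rp, const limb *up, size_t n)
    {
      for (size_t i = n; i-- > 0; )
        rp[i] = up[i];
    }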
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm
new file mode 100644
index 0000000..2e05afe
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm
@@ -0,0 +1,90 @@
+dnl ARM Neon mpn_copyi optimised for A15.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 1.75 slower than core register code
+C Cortex-A15 0.52
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+ cmp n, #7
+ ble L(bc)
+
+C Copy until rp is 128-bit aligned
+ tst rp, #4
+ beq L(al1)
+ vld1.32 {d22[0]}, [up]!
+ sub n, n, #1
+ vst1.32 {d22[0]}, [rp]!
+L(al1): tst rp, #8
+ beq L(al2)
+ vld1.32 {d22}, [up]!
+ sub n, n, #2
+ vst1.32 {d22}, [rp:64]!
+L(al2): vld1.32 {d26-d27}, [up]!
+ subs n, n, #12
+ blt L(end)
+
+ ALIGN(16)
+L(top): vld1.32 {d22-d23}, [up]!
+ vst1.32 {d26-d27}, [rp:128]!
+ vld1.32 {d26-d27}, [up]!
+ vst1.32 {d22-d23}, [rp:128]!
+ subs n, n, #8
+ bge L(top)
+
+L(end): vst1.32 {d26-d27}, [rp:128]!
+
+C Copy last 0-7 limbs.  Note that rp is aligned after the loop, but not when we
+C arrive here via L(bc)
+L(bc): tst n, #4
+ beq L(tl1)
+ vld1.32 {d22-d23}, [up]!
+ vst1.32 {d22-d23}, [rp]!
+L(tl1): tst n, #2
+ beq L(tl2)
+ vld1.32 {d22}, [up]!
+ vst1.32 {d22}, [rp]!
+L(tl2): tst n, #1
+ beq L(tl3)
+ vld1.32 {d22[0]}, [up]
+ vst1.32 {d22[0]}, [rp]
+L(tl3): bx lr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
new file mode 100644
index 0000000..2c11d6d
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm
@@ -0,0 +1,177 @@
+dnl ARM Neon mpn_rsh1add_n, mpn_rsh1sub_n.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM -
+C XScale -
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 4-5
+C Cortex-A15 2.5
+
+C TODO
+C * Try to make this smaller; its size (384 bytes) is excessive.
+C * Try to reach 2.25 c/l on A15, to match the addlsh_1 family.
+C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps
+C insufficiently for A7 and A8.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n', `r3')
+
+ifdef(`OPERATION_rsh1add_n', `
+ define(`ADDSUBS', `adds $1, $2, $3')
+ define(`ADCSBCS', `adcs $1, $2, $3')
+ define(`IFADD', `$1')
+ define(`IFSUB', `')
+ define(`func', mpn_rsh1add_n)')
+ifdef(`OPERATION_rsh1sub_n', `
+ define(`ADDSUBS', `subs $1, $2, $3')
+ define(`ADCSBCS', `sbcs $1, $2, $3')
+ define(`IFADD', `')
+ define(`IFSUB', `$1')
+ define(`func', mpn_rsh1sub_n)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+PROLOGUE(func)
+ push {r4-r10}
+
+ ands r4, n, #3
+ beq L(b00)
+ cmp r4, #2
+ blo L(b01)
+ beq L(b10)
+
+L(b11): ldmia up!, {r9,r10,r12}
+ ldmia vp!, {r5,r6,r7}
+ ADDSUBS( r9, r9, r5)
+ vmov d4, r9, r9
+ ADCSBCS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vshr.u64 d3, d4, #1
+ vmov d1, r10, r12
+ vsli.u64 d3, d1, #31
+ vshr.u64 d2, d1, #1
+ vst1.32 {d3[0]}, [rp]!
+ bics n, n, #3
+ beq L(wd2)
+L(gt3): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ b L(mi0)
+
+L(b10): ldmia up!, {r10,r12}
+ ldmia vp!, {r6,r7}
+ ADDSUBS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vmov d4, r10, r12
+ bics n, n, #2
+ vshr.u64 d2, d4, #1
+ beq L(wd2)
+L(gt2): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ b L(mi0)
+
+L(b01): ldr r12, [up], #4
+ ldr r7, [vp], #4
+ ADDSUBS( r12, r12, r7)
+ vmov d4, r12, r12
+ bics n, n, #1
+ bne L(gt1)
+ mov r5, r12, lsr #1
+IFADD(` adc r1, n, #0')
+IFSUB(` adc r1, n, #1')
+ bfi r5, r1, #31, #1
+ str r5, [rp]
+ and r0, r12, #1
+ pop {r4-r10}
+ bx r14
+L(gt1): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ vshr.u64 d2, d4, #1
+ ADCSBCS( r8, r8, r4)
+ ADCSBCS( r9, r9, r5)
+ vmov d0, r8, r9
+ ADCSBCS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vsli.u64 d2, d0, #31
+ vshr.u64 d3, d0, #1
+ vst1.32 {d2[0]}, [rp]!
+ b L(mi1)
+
+L(b00): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ ADDSUBS( r8, r8, r4)
+ ADCSBCS( r9, r9, r5)
+ vmov d4, r8, r9
+ ADCSBCS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vshr.u64 d3, d4, #1
+ b L(mi1)
+
+ ALIGN(16)
+L(top): ldmia up!, {r8,r9,r10,r12}
+ ldmia vp!, {r4,r5,r6,r7}
+ vsli.u64 d3, d1, #63
+ vshr.u64 d2, d1, #1
+ vst1.32 {d3}, [rp]!
+L(mi0): ADCSBCS( r8, r8, r4)
+ ADCSBCS( r9, r9, r5)
+ vmov d0, r8, r9
+ ADCSBCS( r10, r10, r6)
+ ADCSBCS( r12, r12, r7)
+ vsli.u64 d2, d0, #63
+ vshr.u64 d3, d0, #1
+ vst1.32 {d2}, [rp]!
+L(mi1): vmov d1, r10, r12
+ sub n, n, #4
+ tst n, n
+ bne L(top)
+
+L(end): vsli.u64 d3, d1, #63
+ vshr.u64 d2, d1, #1
+ vst1.32 {d3}, [rp]!
+L(wd2): vmov r4, r5, d2
+IFADD(` adc r1, n, #0')
+IFSUB(` adc r1, n, #1')
+ bfi r5, r1, #31, #1
+ stm rp, {r4,r5}
+
+L(rtn): vmov.32 r0, d4[0]
+ and r0, r0, #1
+ pop {r4-r10}
+ bx r14
+EPILOGUE()
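mpn_rsh1add_n and mpn_rsh1sub_n compute the right-shifted sum or difference,
({up,n} + {vp,n}) >> 1 for the add variant, and return the bit shifted out
at the bottom.  In the code above the core registers run the adcs/sbcs carry
chain while NEON splices consecutive sum limbs one bit to the right
(vshr.u64 plus vsli.u64 with #63 or #31).  A C sketch of the add variant's
semantics, for n >= 1 on 32-bit limbs (hypothetical name, illustrative
only):

    #include <stdint.h>

    typedef uint32_t limb;

    /* Illustrative: {rp,n} = ({up,n} + {vp,n}) >> 1; returns the bit
       shifted out.  Assumes n >= 1.  */
    limb ref_rsh1add_n (limb *rp, const limb *up, const limb *vp, long n)
    {
      uint64_t s = (uint64_t) up[0] + vp[0];
      limb ret = (limb) s & 1;         /* the bit shifted out */
      limb prev = (limb) s;            /* previous sum limb, pre-shift */
      limb cy = (limb) (s >> 32);      /* running carry of the addition */
      for (long i = 1; i < n; i++)
        {
          s = (uint64_t) up[i] + vp[i] + cy;
          limb cur = (limb) s;
          cy = (limb) (s >> 32);
          rp[i - 1] = (prev >> 1) | (cur << 31);  /* splice one bit down */
          prev = cur;
        }
      rp[n - 1] = (prev >> 1) | (cy << 31);  /* carry becomes the top bit */
      return ret;
    }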