From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/arm/README | 35 ++ gmp-6.3.0/mpn/arm/aors_n.asm | 112 +++++ gmp-6.3.0/mpn/arm/aorslsh1_n.asm | 167 +++++++ gmp-6.3.0/mpn/arm/aorsmul_1.asm | 135 +++++ gmp-6.3.0/mpn/arm/arm-defs.m4 | 100 ++++ gmp-6.3.0/mpn/arm/bdiv_dbm1c.asm | 113 +++++ gmp-6.3.0/mpn/arm/bdiv_q_1.asm | 162 ++++++ gmp-6.3.0/mpn/arm/cnd_aors_n.asm | 134 +++++ gmp-6.3.0/mpn/arm/com.asm | 75 +++ gmp-6.3.0/mpn/arm/copyd.asm | 84 ++++ gmp-6.3.0/mpn/arm/copyi.asm | 79 +++ gmp-6.3.0/mpn/arm/dive_1.asm | 151 ++++++ gmp-6.3.0/mpn/arm/gmp-mparam.h | 127 +++++ gmp-6.3.0/mpn/arm/invert_limb.asm | 93 ++++ gmp-6.3.0/mpn/arm/logops_n.asm | 139 ++++++ gmp-6.3.0/mpn/arm/lshift.asm | 88 ++++ gmp-6.3.0/mpn/arm/lshiftc.asm | 95 ++++ gmp-6.3.0/mpn/arm/mod_34lsub1.asm | 124 +++++ gmp-6.3.0/mpn/arm/mode1o.asm | 92 ++++ gmp-6.3.0/mpn/arm/mul_1.asm | 94 ++++ gmp-6.3.0/mpn/arm/neon/README | 2 + gmp-6.3.0/mpn/arm/neon/hamdist.asm | 194 ++++++++ gmp-6.3.0/mpn/arm/neon/lorrshift.asm | 279 +++++++++++ gmp-6.3.0/mpn/arm/neon/lshiftc.asm | 242 +++++++++ gmp-6.3.0/mpn/arm/neon/popcount.asm | 166 +++++++ gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm | 140 ++++++ gmp-6.3.0/mpn/arm/rsh1aors_n.asm | 124 +++++ gmp-6.3.0/mpn/arm/rshift.asm | 86 ++++ gmp-6.3.0/mpn/arm/sec_tabselect.asm | 131 +++++ gmp-6.3.0/mpn/arm/udiv.asm | 104 ++++ gmp-6.3.0/mpn/arm/v5/gcd_11.asm | 70 +++ gmp-6.3.0/mpn/arm/v5/gcd_22.asm | 117 +++++ gmp-6.3.0/mpn/arm/v5/mod_1_1.asm | 129 +++++ gmp-6.3.0/mpn/arm/v5/mod_1_2.asm | 156 ++++++ gmp-6.3.0/mpn/arm/v6/addmul_1.asm | 112 +++++ gmp-6.3.0/mpn/arm/v6/addmul_2.asm | 125 +++++ gmp-6.3.0/mpn/arm/v6/addmul_3.asm | 191 ++++++++ gmp-6.3.0/mpn/arm/v6/dive_1.asm | 149 ++++++ gmp-6.3.0/mpn/arm/v6/gmp-mparam.h | 187 +++++++ gmp-6.3.0/mpn/arm/v6/mode1o.asm | 95 ++++ gmp-6.3.0/mpn/arm/v6/mul_1.asm | 115 +++++ gmp-6.3.0/mpn/arm/v6/mul_2.asm | 135 +++++ 
gmp-6.3.0/mpn/arm/v6/popham.asm | 139 ++++++ gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm | 544 +++++++++++++++++++++ gmp-6.3.0/mpn/arm/v6/submul_1.asm | 125 +++++ gmp-6.3.0/mpn/arm/v6t2/divrem_1.asm | 212 ++++++++ gmp-6.3.0/mpn/arm/v6t2/gcd_11.asm | 65 +++ gmp-6.3.0/mpn/arm/v6t2/gcd_22.asm | 113 +++++ gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm | 145 ++++++ gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm | 162 ++++++ gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm | 36 ++ gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm | 158 ++++++ gmp-6.3.0/mpn/arm/v7a/cora15/com.asm | 180 +++++++ gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h | 212 ++++++++ gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm | 253 ++++++++++ gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm | 104 ++++ .../mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm | 43 ++ .../mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm | 43 ++ .../mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm | 144 ++++++ gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm | 97 ++++ gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm | 110 +++++ gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm | 90 ++++ gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm | 177 +++++++ gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm | 159 ++++++ gmp-6.3.0/mpn/arm/v7a/cora17/addmul_1.asm | 34 ++ gmp-6.3.0/mpn/arm/v7a/cora17/gmp-mparam.h | 233 +++++++++ gmp-6.3.0/mpn/arm/v7a/cora17/mod_34lsub1.asm | 121 +++++ gmp-6.3.0/mpn/arm/v7a/cora17/mul_1.asm | 34 ++ gmp-6.3.0/mpn/arm/v7a/cora17/submul_1.asm | 34 ++ gmp-6.3.0/mpn/arm/v7a/cora5/gmp-mparam.h | 205 ++++++++ gmp-6.3.0/mpn/arm/v7a/cora7/gmp-mparam.h | 202 ++++++++ gmp-6.3.0/mpn/arm/v7a/cora8/bdiv_q_1.asm | 158 ++++++ gmp-6.3.0/mpn/arm/v7a/cora8/gmp-mparam.h | 207 ++++++++ gmp-6.3.0/mpn/arm/v7a/cora9/bdiv_q_1.asm | 36 ++ gmp-6.3.0/mpn/arm/v7a/cora9/gmp-mparam.h | 211 ++++++++ 75 files changed, 10034 insertions(+) create mode 100644 gmp-6.3.0/mpn/arm/README create mode 100644 gmp-6.3.0/mpn/arm/aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm/aorslsh1_n.asm create mode 100644 gmp-6.3.0/mpn/arm/aorsmul_1.asm 
create mode 100644 gmp-6.3.0/mpn/arm/arm-defs.m4 create mode 100644 gmp-6.3.0/mpn/arm/bdiv_dbm1c.asm create mode 100644 gmp-6.3.0/mpn/arm/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/arm/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm/com.asm create mode 100644 gmp-6.3.0/mpn/arm/copyd.asm create mode 100644 gmp-6.3.0/mpn/arm/copyi.asm create mode 100644 gmp-6.3.0/mpn/arm/dive_1.asm create mode 100644 gmp-6.3.0/mpn/arm/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm/invert_limb.asm create mode 100644 gmp-6.3.0/mpn/arm/logops_n.asm create mode 100644 gmp-6.3.0/mpn/arm/lshift.asm create mode 100644 gmp-6.3.0/mpn/arm/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/arm/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/arm/mode1o.asm create mode 100644 gmp-6.3.0/mpn/arm/mul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/neon/README create mode 100644 gmp-6.3.0/mpn/arm/neon/hamdist.asm create mode 100644 gmp-6.3.0/mpn/arm/neon/lorrshift.asm create mode 100644 gmp-6.3.0/mpn/arm/neon/lshiftc.asm create mode 100644 gmp-6.3.0/mpn/arm/neon/popcount.asm create mode 100644 gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/arm/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm/rshift.asm create mode 100644 gmp-6.3.0/mpn/arm/sec_tabselect.asm create mode 100644 gmp-6.3.0/mpn/arm/udiv.asm create mode 100644 gmp-6.3.0/mpn/arm/v5/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/arm/v5/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/arm/v5/mod_1_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v5/mod_1_2.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/addmul_2.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/addmul_3.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/dive_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm/v6/mode1o.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/mul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/mul_2.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/popham.asm create mode 
100644 gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm create mode 100644 gmp-6.3.0/mpn/arm/v6/submul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v6t2/divrem_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v6t2/gcd_11.asm create mode 100644 gmp-6.3.0/mpn/arm/v6t2/gcd_22.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/com.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora17/addmul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora17/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora17/mod_34lsub1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora17/mul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora17/submul_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora5/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora7/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora8/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora8/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora9/bdiv_q_1.asm create mode 100644 gmp-6.3.0/mpn/arm/v7a/cora9/gmp-mparam.h (limited to 'gmp-6.3.0/mpn/arm') diff --git 
a/gmp-6.3.0/mpn/arm/README b/gmp-6.3.0/mpn/arm/README new file mode 100644 index 0000000..53c7214 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/README @@ -0,0 +1,35 @@ +Copyright 2002, 2012, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. + + + + + +This directory contains mpn functions for ARM processors. It has been +optimised mainly for Cortex-A9 and Cortex-A15, but the code in the top-level +directory should run on all ARM processors at architecture level v4 or later. diff --git a/gmp-6.3.0/mpn/arm/aors_n.asm b/gmp-6.3.0/mpn/arm/aors_n.asm new file mode 100644 index 0000000..fdad9f7 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/aors_n.asm @@ -0,0 +1,112 @@ +dnl ARM mpn_add_n and mpn_sub_n + +dnl Contributed to the GNU project by Robert Harley. + +dnl Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? 
+C Cortex-A9 2.5 slightly fluctuating +C Cortex-A15 2.25 + +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +ifdef(`OPERATION_add_n', ` + define(`ADDSUB', adds) + define(`ADDSUBC', adcs) + define(`CLRCY', `cmn r0, #0') + define(`SETCY', `cmp $1, #1') + define(`RETVAL', `adc r0, n, #0') + define(`func', mpn_add_n) + define(`func_nc', mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(`ADDSUB', subs) + define(`ADDSUBC', sbcs) + define(`CLRCY', `cmp r0, r0') + define(`SETCY', `rsbs $1, $1, #0') + define(`RETVAL', `sbc r0, r0, r0 + and r0, r0, #1') + define(`func', mpn_sub_n) + define(`func_nc', mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(func_nc) + ldr r12, [sp, #0] + stmfd sp!, { r8, r9, lr } + SETCY( r12) + b L(ent) +EPILOGUE() +PROLOGUE(func) + stmfd sp!, { r8, r9, lr } + CLRCY( r12) +L(ent): tst n, #1 + beq L(skip1) + ldr r12, [up], #4 + ldr lr, [vp], #4 + ADDSUBC r12, r12, lr + str r12, [rp], #4 +L(skip1): + tst n, #2 + beq L(skip2) + ldmia up!, { r8, r9 } + ldmia vp!, { r12, lr } + ADDSUBC r8, r8, r12 + ADDSUBC r9, r9, lr + stmia rp!, { r8, r9 } +L(skip2): + bics n, n, #3 + beq L(rtn) + stmfd sp!, { r4, r5, r6, r7 } + +L(top): ldmia up!, { r4, r5, r6, r7 } + ldmia vp!, { r8, r9, r12, lr } + ADDSUBC r4, r4, r8 + sub n, n, #4 + ADDSUBC r5, r5, r9 + ADDSUBC r6, r6, r12 + ADDSUBC r7, r7, lr + stmia rp!, { r4, r5, r6, r7 } + teq n, #0 + bne L(top) + + ldmfd sp!, { r4, r5, r6, r7 } + +L(rtn): RETVAL + ldmfd sp!, { r8, r9, pc } +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/aorslsh1_n.asm b/gmp-6.3.0/mpn/arm/aorslsh1_n.asm new file mode 100644 index 0000000..889e654 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/aorslsh1_n.asm @@ -0,0 +1,167 @@ +dnl ARM mpn_addlsh1_n and mpn_sublsh1_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C addlsh1_n sublsh1_n +C cycles/limb cycles/limb +C StrongARM ? ? +C XScale ? ? +C Cortex-A7 ? ? +C Cortex-A8 ? ? +C Cortex-A9 3.12 3.7 +C Cortex-A15 ? ? + +C TODO +C * The addlsh1_n code runs well, but is only barely faster than mpn_addmul_1. +C The sublsh1_n code could surely be tweaked, its REVCY slows down things +C very much. If two insns are really needed, it might help to separate them +C for better micro-parallelism. 
+ +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +ifdef(`OPERATION_addlsh1_n', ` + define(`ADDSUB', adds) + define(`ADDSUBC', adcs) + define(`SETCY', `cmp $1, #1') + define(`RETVAL', `adc r0, $1, #2') + define(`SAVECY', `sbc $1, $2, #0') + define(`RESTCY', `cmn $1, #1') + define(`REVCY', `') + define(`INICYR', `mov $1, #0') + define(`r10r11', `r11') + define(`func', mpn_addlsh1_n) + define(`func_nc', mpn_addlsh1_nc)') +ifdef(`OPERATION_sublsh1_n', ` + define(`ADDSUB', subs) + define(`ADDSUBC', sbcs) + define(`SETCY', `rsbs $1, $1, #0') + define(`RETVAL', `adc r0, $1, #1') + define(`SAVECY', `sbc $1, $1, $1') + define(`RESTCY', `cmn $1, #1') + define(`REVCY', `sbc $1, $1, $1 + cmn $1, #1') + define(`INICYR', `mvn $1, #0') + define(`r10r11', `r10') + define(`func', mpn_sublsh1_n) + define(`func_nc', mpn_sublsh1_nc)') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n) + +ASM_START() +PROLOGUE(func) + push {r4-r10r11, r14} + +ifdef(`OPERATION_addlsh1_n', ` + mvn r11, #0 +') + INICYR( r14) + subs n, n, #3 + blt L(le2) C carry clear on branch path + + cmn r0, #0 C clear carry + ldmia vp!, {r8, r9, r10} + b L(mid) + +L(top): RESTCY( r14) + ADDSUBC r4, r4, r8 + ADDSUBC r5, r5, r9 + ADDSUBC r6, r6, r10 + ldmia vp!, {r8, r9, r10} + stmia rp!, {r4, r5, r6} + REVCY(r14) + adcs r8, r8, r8 + adcs r9, r9, r9 + adcs r10, r10, r10 + ldmia up!, {r4, r5, r6} + SAVECY( r14, r11) + subs n, n, #3 + blt L(exi) + RESTCY( r12) + ADDSUBC r4, r4, r8 + ADDSUBC r5, r5, r9 + ADDSUBC r6, r6, r10 + ldmia vp!, {r8, r9, r10} + stmia rp!, {r4, r5, r6} + REVCY(r12) +L(mid): adcs r8, r8, r8 + adcs r9, r9, r9 + adcs r10, r10, r10 + ldmia up!, {r4, r5, r6} + SAVECY( r12, r11) + subs n, n, #3 + bge L(top) + + mov r7, r12 C swap alternating... + mov r12, r14 C ...carry-save... 
+ mov r14, r7 C ...registers + +L(exi): RESTCY( r12) + ADDSUBC r4, r4, r8 + ADDSUBC r5, r5, r9 + ADDSUBC r6, r6, r10 + stmia rp!, {r4, r5, r6} + + REVCY(r12) +L(le2): tst n, #1 C n = {-1,-2,-3} map to [2], [1], [0] + beq L(e1) + +L(e02): tst n, #2 + beq L(rt0) + ldm vp, {r8, r9} + adcs r8, r8, r8 + adcs r9, r9, r9 + ldm up, {r4, r5} + SAVECY( r12, r11) + RESTCY( r14) + ADDSUBC r4, r4, r8 + ADDSUBC r5, r5, r9 + stm rp, {r4, r5} + b L(rt1) + +L(e1): ldr r8, [vp] + adcs r8, r8, r8 + ldr r4, [up] + SAVECY( r12, r11) + RESTCY( r14) + ADDSUBC r4, r4, r8 + str r4, [rp] + +L(rt1): mov r14, r12 + REVCY(r12) +L(rt0): RETVAL( r14) + pop {r4-r10r11, r14} + return r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/aorsmul_1.asm b/gmp-6.3.0/mpn/arm/aorsmul_1.asm new file mode 100644 index 0000000..b02fbb3 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/aorsmul_1.asm @@ -0,0 +1,135 @@ +dnl ARM mpn_addmul_1 and mpn_submul_1. + +dnl Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. 
If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 5.25 +C Cortex-A15 4 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`vl', `r3') +define(`rl', `r12') +define(`ul', `r6') +define(`r', `lr') + +ifdef(`OPERATION_addmul_1', ` + define(`ADDSUB', adds) + define(`ADDSUBC', adcs) + define(`CLRRCY', `mov $1, #0 + adds r0, r0, #0') + define(`RETVAL', `adc r0, r4, #0') + define(`func', mpn_addmul_1)') +ifdef(`OPERATION_submul_1', ` + define(`ADDSUB', subs) + define(`ADDSUBC', sbcs) + define(`CLRRCY', `subs $1, r0, r0') + define(`RETVAL', `sbc r0, r0, r0 + sub r0, $1, r0') + define(`func', mpn_submul_1)') + +MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) + +ASM_START() +PROLOGUE(func) + stmfd sp!, { r4-r6, lr } + CLRRCY( r4) + tst n, #1 + beq L(skip1) + ldr ul, [up], #4 + ldr rl, [rp, #0] + umull r5, r4, ul, vl + ADDSUB r, rl, r5 + str r, [rp], #4 +L(skip1): + tst n, #2 + beq L(skip2) + ldr ul, [up], #4 + ldr rl, [rp, #0] + mov r5, #0 + umlal r4, r5, ul, vl + ldr ul, [up], #4 + ADDSUBC r, rl, r4 + ldr rl, [rp, #4] + mov r4, #0 + umlal r5, r4, ul, vl + str r, [rp], #4 + ADDSUBC r, rl, r5 + str r, [rp], #4 +L(skip2): + bics n, n, #3 + beq L(rtn) + + ldr ul, [up], #4 + ldr rl, [rp, #0] + mov r5, #0 + umlal r4, r5, ul, vl + b L(in) + +L(top): ldr ul, [up], #4 + ADDSUBC r, rl, r5 + ldr rl, [rp, #4] + mov r5, #0 + umlal r4, r5, ul, vl + str r, [rp], #4 +L(in): ldr ul, [up], #4 + ADDSUBC r, rl, r4 + ldr rl, [rp, #4] + mov r4, #0 + umlal r5, r4, ul, vl + str r, [rp], #4 + ldr ul, [up], #4 + ADDSUBC r, rl, r5 + ldr rl, [rp, #4] + mov r5, #0 + umlal r4, r5, ul, vl + str r, [rp], #4 + ldr ul, [up], #4 + ADDSUBC r, rl, r4 + ldr rl, [rp, #4] + mov r4, #0 + umlal r5, r4, ul, vl + sub n, n, #4 + tst n, n + str r, [rp], #4 + bne L(top) + + ADDSUBC r, rl, r5 + str r, [rp] + +L(rtn): RETVAL( r4) + ldmfd sp!, { r4-r6, pc } +EPILOGUE() diff --git 
a/gmp-6.3.0/mpn/arm/arm-defs.m4 b/gmp-6.3.0/mpn/arm/arm-defs.m4 new file mode 100644 index 0000000..4b4fa0b --- /dev/null +++ b/gmp-6.3.0/mpn/arm/arm-defs.m4 @@ -0,0 +1,100 @@ +divert(-1) + +dnl m4 macros for ARM assembler. + +dnl Copyright 2001, 2012-2016, 2018-2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +dnl Standard commenting is with @, the default m4 # is for constants and we +dnl don't want to disable macro expansions in or after them. + +changecom(@&*$) + +define(`ASM_START', +m4_assert_numargs_range(0,1) +`ifelse($1,`neon',`.fpu neon', + $1,,`', + 1,1,`m4_error(`$0 got invalid argument $1')')') + +dnl APCS register names. 
+ +deflit(a1,r0) +deflit(a2,r1) +deflit(a3,r2) +deflit(a4,r3) +dnl deflit(v1,r4) +dnl deflit(v2,r5) +dnl deflit(v3,r6) +dnl deflit(v4,r7) +dnl deflit(v5,r8) +dnl deflit(v6,r9) +deflit(sb,r9) +dnl deflit(v7,r10) +deflit(sl,r10) +deflit(fp,r11) +deflit(ip,r12) +dnl deflit(sp,r13) +deflit(lr,r14) +deflit(pc,r15) + + +define(`lea_list', `') +define(`lea_num',0) + +dnl LEA(reg,gmp_symbol) +dnl +dnl Load the address of gmp_symbol into a register. The gmp_symbol must be +dnl either local or protected/hidden, since we assume it has a fixed distance +dnl from the point of use. + +define(`LEA',`dnl +ldr $1, L(ptr`'lea_num) +ifdef(`PIC',dnl +`dnl +L(bas`'lea_num):dnl + add $1, $1, pc`'dnl + m4append(`lea_list',` +L(ptr'lea_num`): .word GSYM_PREFIX`'$2-L(bas'lea_num`)-8') + define(`lea_num', eval(lea_num+1))dnl +',`dnl + m4append(`lea_list',` +L(ptr'lea_num`): .word GSYM_PREFIX`'$2') + define(`lea_num', eval(lea_num+1))dnl +')dnl +') + +define(`return',`ifdef(`NOTHUMB',`mov pc, ',`bx')') + + +define(`EPILOGUE_cpu', +`lea_list + SIZE(`$1',.-`$1')' +`define(`lea_list', `')') + +divert diff --git a/gmp-6.3.0/mpn/arm/bdiv_dbm1c.asm b/gmp-6.3.0/mpn/arm/bdiv_dbm1c.asm new file mode 100644 index 0000000..b919dc4 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/bdiv_dbm1c.asm @@ -0,0 +1,113 @@ +dnl ARM mpn_bdiv_dbm1c. + +dnl Copyright 2008, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 4.25 +C Cortex-A15 2.5 + +C TODO +C * Try using umlal or umaal. +C * Try using ldm/stm. + +define(`qp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`bd', `r3') +define(`cy', `sp,#0') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_bdiv_dbm1c) + push {r4, r5, r6, r7, r8} + ldr r4, [up], #4 + ldr r5, [sp, #20] + ands r12, n, #3 + beq L(fi0) + cmp r12, #2 + bcc L(fi1) + beq L(fi2) + +L(fi3): umull r8, r12, r4, bd + ldr r4, [up], #4 + b L(lo3) + +L(fi0): umull r6, r7, r4, bd + ldr r4, [up], #4 + b L(lo0) + +L(fi1): subs n, n, #1 + umull r8, r12, r4, bd + bls L(wd1) + ldr r4, [up], #4 + b L(lo1) + +L(fi2): umull r6, r7, r4, bd + ldr r4, [up], #4 + b L(lo2) + +L(top): ldr r4, [up], #4 + subs r5, r5, r6 + str r5, [qp], #4 + sbc r5, r5, r7 +L(lo1): umull r6, r7, r4, bd + ldr r4, [up], #4 + subs r5, r5, r8 + str r5, [qp], #4 + sbc r5, r5, r12 +L(lo0): umull r8, r12, r4, bd + ldr r4, [up], #4 + subs r5, r5, r6 + str r5, [qp], #4 + sbc r5, r5, r7 +L(lo3): umull r6, r7, r4, bd + ldr r4, [up], #4 + subs r5, r5, r8 + str r5, [qp], #4 + sbc r5, r5, r12 +L(lo2): subs n, n, #4 + umull r8, r12, r4, bd + bhi L(top) + +L(wd2): subs r5, r5, r6 + str r5, [qp], #4 + sbc r5, r5, r7 +L(wd1): subs r5, r5, r8 + str r5, [qp] + sbc r0, r5, r12 + pop {r4, r5, r6, r7, r8} + return lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/bdiv_q_1.asm b/gmp-6.3.0/mpn/arm/bdiv_q_1.asm new file mode 100644 index 0000000..ae395d1 --- 
/dev/null +++ b/gmp-6.3.0/mpn/arm/bdiv_q_1.asm @@ -0,0 +1,162 @@ +dnl ARM v4 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C cycles/limb +C norm unorm +C 1176 13 18 +C Cortex-A5 8 12 +C Cortex-A7 10.5 18 +C Cortex-A8 14 15 +C Cortex-A9 10 12 not measured since latest edits +C Cortex-A15 9 9 +C Cortex-A53 14 20 + +C Architecture requirements: +C v5 - +C v5t - +C v5te - +C v6 - +C v6t2 - +C v7a - + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`d', `r3') +define(`di_arg', `sp[0]') C just mpn_pi1_bdiv_q_1 +define(`cnt_arg', `sp[4]') C just mpn_pi1_bdiv_q_1 + +define(`cy', `r7') +define(`cnt', `r6') +define(`tnc', `r8') + +ASM_START() +PROLOGUE(mpn_bdiv_q_1) + tst d, #1 + push {r6-r11} + mov cnt, #0 + bne L(inv) + +C count trailing zeros + movs r10, d, lsl #16 + moveq d, d, lsr #16 + moveq cnt, #16 + tst d, #0xff + moveq d, d, lsr #8 + addeq cnt, cnt, #8 + LEA( r10, ctz_tab) + and r11, d, #0xff + ldrb r10, [r10, r11] + mov d, d, lsr r10 + add cnt, cnt, r10 + +C binvert limb +L(inv): LEA( r10, binvert_limb_table) + and r12, d, #254 + ldrb r10, [r10, r12, lsr #1] + mul r12, r10, r10 + mul r12, d, r12 + rsb r12, r12, r10, lsl #1 + mul r10, r12, r12 + mul r10, d, r10 + rsb r10, r10, r12, lsl #1 C r10 = inverse + b L(pi1) +EPILOGUE() + +PROLOGUE(mpn_pi1_bdiv_q_1) + push {r6-r11} + + ldr cnt, [sp, #28] + ldr r10, [sp, #24] + +L(pi1): ldr r11, [up], #4 C up[0] + cmp cnt, #0 + mov cy, #0 + bne L(unorm) + +L(norm): + subs n, n, #1 C set carry as side-effect + beq L(edn) + + ALIGN(16) +L(tpn): sbcs cy, r11, cy + ldr r11, [up], #4 + sub n, n, #1 + mul r9, r10, cy + tst n, n + umull r12, cy, d, r9 + str r9, [rp], #4 + bne L(tpn) + +L(edn): sbc cy, r11, cy + mul r9, r10, cy + str r9, [rp] + pop {r6-r11} + return r14 + +L(unorm): + rsb tnc, cnt, #32 + mov r11, r11, lsr cnt + subs n, n, #1 C set carry as side-effect + beq L(edu) + + ALIGN(16) +L(tpu): ldr r12, [up], #4 + orr r9, r11, r12, lsl tnc + mov r11, r12, lsr cnt + sbcs cy, r9, cy C critical path ->cy->cy-> + sub n, n, #1 + mul r9, r10, cy C critical path ->cy->r9-> + tst n, n + umull r12, cy, d, 
r9 C critical path ->r9->cy-> + str r9, [rp], #4 + bne L(tpu) + +L(edu): sbc cy, r11, cy + mul r9, r10, cy + str r9, [rp] + pop {r6-r11} + return r14 +EPILOGUE() + + RODATA +ctz_tab: + .byte 8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 diff --git a/gmp-6.3.0/mpn/arm/cnd_aors_n.asm b/gmp-6.3.0/mpn/arm/cnd_aors_n.asm new file mode 100644 index 0000000..0479f0d --- /dev/null +++ b/gmp-6.3.0/mpn/arm/cnd_aors_n.asm @@ -0,0 +1,134 @@ +dnl ARM mpn_cnd_add_n, mpn_cnd_sub_n + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 3 +C Cortex-A15 2.5 + +define(`cnd', `r0') +define(`rp', `r1') +define(`up', `r2') +define(`vp', `r3') + +define(`n', `r12') + + +ifdef(`OPERATION_cnd_add_n', ` + define(`ADDSUB', adds) + define(`ADDSUBC', adcs) + define(`INITCY', `cmn r0, #0') + define(`RETVAL', `adc r0, n, #0') + define(func, mpn_cnd_add_n)') +ifdef(`OPERATION_cnd_sub_n', ` + define(`ADDSUB', subs) + define(`ADDSUBC', sbcs) + define(`INITCY', `cmp r0, #0') + define(`RETVAL', `adc r0, n, #0 + rsb r0, r0, #1') + define(func, mpn_cnd_sub_n)') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() +PROLOGUE(func) + push {r4-r11} C 8 registers, 32 bytes + ldr n, [sp, #32] C n is read from the stack just above the saved registers + + cmp cnd, #1 C carry set iff cnd is nonzero + sbc cnd, cnd, cnd C conditionally set to 0xffffffff (when cnd was 0, else 0) + + INITCY C really only needed for n = 0 (mod 4) + + ands r4, n, #3 C r4 = n mod 4 + beq L(top) + cmp r4, #2 + bcc L(b1) C n mod 4 = 1 + beq L(b2) C n mod 4 = 2 + +L(b3): ldm vp!, {r4,r5,r6} C n mod 4 = 3 + ldm up!, {r8,r9,r10} + bic r4, r4, cnd C zero the vp limb when the mask is all ones + bic r5, r5, cnd + bic r6, r6, cnd + ADDSUB r8, r8, r4 C first limb starts a fresh carry chain + ADDSUBC r9, r9, r5 + ADDSUBC r10, r10, r6 + stm rp!, {r8,r9,r10} + sub n, n, #3 + teq n, #0 C test n; the carry has to survive into L(top) + bne L(top) + b L(end) + +L(b2): ldm vp!, {r4,r5} + ldm up!, {r8,r9} + bic r4, r4, cnd + bic r5, r5, cnd + ADDSUB r8, r8, r4 + ADDSUBC r9, r9, r5 + stm rp!, {r8,r9} + sub n, n, #2 + teq n, #0 + bne L(top) + b L(end) + +L(b1): ldr r4, [vp], #4 + ldr r8, [up], #4 + bic r4, r4, cnd + ADDSUB r8, r8, r4 + str r8, [rp], #4 + sub n, n, #1 + teq n, #0 + beq L(end) + +L(top): ldm vp!, {r4,r5,r6,r7} C main loop: 4 limbs per iteration + ldm up!, {r8,r9,r10,r11} + bic r4, r4, cnd + bic r5, r5, cnd + bic r6, r6, cnd + bic r7, r7, cnd + ADDSUBC r8, r8, r4 C continues the carry chain from the previous limbs or from INITCY + ADDSUBC r9, r9, r5 + ADDSUBC r10, r10, r6 + ADDSUBC r11, r11, r7 + sub n, n, #4 + stm rp!, {r8,r9,r10,r11} + teq n, #0 + bne
L(top) + +L(end): RETVAL C n is 0 here; RETVAL derives the return value in r0 from the final carry flag + pop {r4-r11} + return r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/com.asm b/gmp-6.3.0/mpn/arm/com.asm new file mode 100644 index 0000000..850b10a --- /dev/null +++ b/gmp-6.3.0/mpn/arm/com.asm @@ -0,0 +1,75 @@ +dnl ARM mpn_com. + +dnl Copyright 2003, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ?
+C Cortex-A9 2.0 +C Cortex-A15 1.75 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_com) + tst n, #1 C odd n: complement one limb first + beq L(skip1) + ldr r3, [up], #4 + mvn r3, r3 C bitwise NOT of the limb + str r3, [rp], #4 +L(skip1): + tst n, #2 C bit 1 of n set: complement two limbs + beq L(skip2) + ldmia up!, { r3, r12 } C load 2 limbs + mvn r3, r3 + mvn r12, r12 + stmia rp!, { r3, r12 } C store 2 limbs +L(skip2): + bics n, n, #3 C clear the low two bits; what remains is a multiple of 4 + beq L(rtn) C nothing left + stmfd sp!, { r7, r8, r9 } C save regs on stack + +L(top): ldmia up!, { r3, r8, r9, r12 } C load 4 limbs + subs n, n, #4 C sets Z for the bne below; nothing in between changes the flags + mvn r3, r3 + mvn r8, r8 + mvn r9, r9 + mvn r12, r12 + stmia rp!, { r3, r8, r9, r12 } C store 4 limbs + bne L(top) + + ldmfd sp!, { r7, r8, r9 } C restore regs from stack +L(rtn): return lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/copyd.asm b/gmp-6.3.0/mpn/arm/copyd.asm new file mode 100644 index 0000000..bcad98d --- /dev/null +++ b/gmp-6.3.0/mpn/arm/copyd.asm @@ -0,0 +1,84 @@ +dnl ARM mpn_copyd. + +dnl Contributed to the GNU project by Robert Harley and Torbjörn Granlund. + +dnl Copyright 2003, 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details.
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.25-1.5 +C Cortex-A15 1.25 + +C TODO +C * Consider wider unrolling. Analogous 8-way code runs 10% faster on both A9 +C and A15. But it probably slows things down for 8 <= n < a few dozen. + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_copyd) + mov r12, n, lsl #2 C r12 = 4*n, the operand size in bytes + sub r12, r12, #4 C byte offset of the highest limb + add rp, rp, r12 + add up, up, r12 C both pointers now address their highest limb + + tst n, #1 C odd n: copy one limb + beq L(skip1) + ldr r3, [up], #-4 + str r3, [rp], #-4 +L(skip1): + tst n, #2 C bit 1 of n set: copy two limbs + beq L(skip2) + ldmda up!, { r3,r12 } + stmda rp!, { r3,r12 } +L(skip2): + bics n, n, #3 C what remains is a multiple of 4 + beq L(rtn) + + push { r4-r5 } + subs n, n, #4 + ldmda up!, { r3,r4,r5,r12 } C load 4 limbs ahead of the loop + beq L(end) C exactly 4 left: just store them + +L(top): subs n, n, #4 + stmda rp!, { r3,r4,r5,r12 } C store the 4 limbs loaded previously + ldmda up!, { r3,r4,r5,r12 } C and load the next 4 + bne L(top) + +L(end): stmda rp, { r3,r4,r5,r12 } C final store, no pointer writeback + pop { r4-r5 } +L(rtn): return lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/copyi.asm b/gmp-6.3.0/mpn/arm/copyi.asm new file mode 100644 index 0000000..421930f --- /dev/null +++ b/gmp-6.3.0/mpn/arm/copyi.asm @@ -0,0 +1,79 @@ +dnl ARM mpn_copyi. + +dnl Contributed to the GNU project by Robert Harley and Torbjörn Granlund. + +dnl Copyright 2003, 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version.
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.25-1.5 +C Cortex-A15 1.25 + +C TODO +C * Consider wider unrolling. Analogous 8-way code runs 10% faster on both A9 +C and A15. But it probably slows things down for 8 <= n < a few dozen. + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_copyi) + tst n, #1 C odd n: copy one limb + beq L(skip1) + ldr r3, [up], #4 + str r3, [rp], #4 +L(skip1): + tst n, #2 C bit 1 of n set: copy two limbs + beq L(skip2) + ldmia up!, { r3,r12 } + stmia rp!, { r3,r12 } +L(skip2): + bics n, n, #3 C what remains is a multiple of 4 + beq L(rtn) + + push { r4-r5 } + subs n, n, #4 + ldmia up!, { r3,r4,r5,r12 } C load 4 limbs ahead of the loop + beq L(end) C exactly 4 left: just store them + +L(top): subs n, n, #4 + stmia rp!, { r3,r4,r5,r12 } C store the 4 limbs loaded previously + ldmia up!, { r3,r4,r5,r12 } C and load the next 4 + bne L(top) + +L(end): stm rp, { r3,r4,r5,r12 } C final store, no pointer writeback + pop { r4-r5 } +L(rtn): return lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/dive_1.asm b/gmp-6.3.0/mpn/arm/dive_1.asm new file mode 100644 index 0000000..8bffb0c --- /dev/null +++ b/gmp-6.3.0/mpn/arm/dive_1.asm @@ -0,0 +1,151 @@ +dnl ARM v4 mpn_divexact_1. + +dnl Contributed to the GNU project by Torbjorn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb +C norm unorm modexact_1c_odd +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? 
+C Cortex-A9 10 12 +C Cortex-A15 9 9 + +C Architecture requirements: +C v5 - +C v5t - +C v5te - +C v6 - +C v6t2 - +C v7a - + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`d', `r3') + +define(`cy', `r7') +define(`cnt', `r6') +define(`tnc', `r8') + +ASM_START() +PROLOGUE(mpn_divexact_1) + tst d, #1 C Z clear when d is odd + push {r4-r9} + mov cnt, #0 + bne L(inv) C d odd: no shift needed and cnt stays 0 + +C count trailing zeros + movs r4, d, lsl #16 C Z set when the low 16 bits of d are zero + moveq d, d, lsr #16 + moveq cnt, #16 + tst d, #0xff C Z set when the low byte is zero + moveq d, d, lsr #8 + addeq cnt, cnt, #8 + LEA( r4, ctz_tab) + and r5, d, #0xff + ldrb r4, [r4, r5] C table lookup indexed by the low byte + mov d, d, lsr r4 C shift out the remaining trailing zeros + add cnt, cnt, r4 C cnt = total right shift applied to d + +C binvert limb +L(inv): LEA( r4, binvert_limb_table) + and r12, d, #254 + ldrb r4, [r4, r12, lsr #1] C table index = bits 1 to 7 of d + mul r12, r4, r4 + mul r12, d, r12 + rsb r12, r12, r4, lsl #1 C r12 = 2*r4 - d*r4*r4 + mul r4, r12, r12 C same refinement step again, now on r12 + mul r4, d, r4 + rsb r4, r4, r12, lsl #1 C r4 = inverse + + tst cnt, cnt C cnt nonzero (d was even) selects the shifting loop + ldr r5, [up], #4 C up[0] + mov cy, #0 + bne L(unnorm) + +L(norm): + subs n, n, #1 C set carry as side-effect + beq L(edn) + + ALIGN(16) +L(tpn): sbcs cy, r5, cy C limb minus previous high product word, with borrow + ldr r5, [up], #4 + sub n, n, #1 + mul r9, r4, cy C quotient limb = low word of inverse times that difference + tst n, n C loop test; leaves the carry for the next sbcs + umull r12, cy, d, r9 C cy = high word of d times the quotient limb + str r9, [rp], #4 + bne L(tpn) + +L(edn): sbc cy, r5, cy C last limb: no further product is needed + mul r9, r4, cy + str r9, [rp] + pop {r4-r9} + return r14 + +L(unnorm): + rsb tnc, cnt, #32 C tnc = 32 - cnt + mov r5, r5, lsr cnt C low part of the first shifted limb + subs n, n, #1 C set carry as side-effect + beq L(edu) + + ALIGN(16) +L(tpu): ldr r12, [up], #4 + orr r9, r5, r12, lsl tnc C r9 = next limb of up shifted right by cnt + mov r5, r12, lsr cnt + sbcs cy, r9, cy C critical path ->cy->cy-> + sub n, n, #1 + mul r9, r4, cy C critical path ->cy->r9-> + tst n, n + umull r12, cy, d, r9 C critical path ->r9->cy-> + str r9, [rp], #4 + bne L(tpu) + +L(edu): sbc cy, r5, cy C last limb is the top source limb shifted right by cnt + mul r9, r4, cy + str r9, [rp] + pop {r4-r9} + return r14 +EPILOGUE() + + RODATA +ctz_tab: + .byte 8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte
5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 diff --git a/gmp-6.3.0/mpn/arm/gmp-mparam.h b/gmp-6.3.0/mpn/arm/gmp-mparam.h new file mode 100644 index 0000000..87eec3a --- /dev/null +++ b/gmp-6.3.0/mpn/arm/gmp-mparam.h @@ -0,0 +1,127 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1193MHz ARM (gcc55.fsffrance.org) */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 56 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 11 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 71 +#define USE_PREINV_DIVREM_1 1 /* preinv always */ +#define DIVREM_2_THRESHOLD 0 /* preinv always */ +#define DIVEXACT_1_THRESHOLD 0 /* always */ +#define BMOD_1_TO_MOD_1_THRESHOLD 41 + +#define MUL_TOOM22_THRESHOLD 36 +#define MUL_TOOM33_THRESHOLD 125 +#define MUL_TOOM44_THRESHOLD 193 +#define MUL_TOOM6H_THRESHOLD 303 +#define MUL_TOOM8H_THRESHOLD 418 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 125 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 176 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 114 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 129 + +#define SQR_BASECASE_THRESHOLD 12 +#define SQR_TOOM2_THRESHOLD 78 +#define SQR_TOOM3_THRESHOLD 137 +#define SQR_TOOM4_THRESHOLD 212 +#define SQR_TOOM6_THRESHOLD 306 +#define SQR_TOOM8_THRESHOLD 422 + +#define MULMOD_BNM1_THRESHOLD 20 +#define SQRMOD_BNM1_THRESHOLD 26 + +#define MUL_FFT_MODF_THRESHOLD 436 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 436, 5}, { 27, 6}, { 28, 7}, { 15, 6}, \ + { 32, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 256, 9}, { 512,10}, { 1024,11}, { 2048,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 28 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 404 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 404, 5}, { 13, 4}, { 27, 5}, { 27, 6}, \ + { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \ + { 35, 7}, { 29, 8}, 
{ 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 47, 8}, \ + { 27, 9}, { 15, 8}, { 39, 9}, { 512,10}, \ + { 1024,11}, { 2048,12}, { 4096,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 26 +#define SQR_FFT_THRESHOLD 3776 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 137 +#define MULLO_MUL_N_THRESHOLD 11479 + +#define DC_DIV_QR_THRESHOLD 150 +#define DC_DIVAPPR_Q_THRESHOLD 494 +#define DC_BDIV_QR_THRESHOLD 148 +#define DC_BDIV_Q_THRESHOLD 345 + +#define INV_MULMOD_BNM1_THRESHOLD 70 +#define INV_NEWTON_THRESHOLD 474 +#define INV_APPR_THRESHOLD 478 + +#define BINV_NEWTON_THRESHOLD 542 +#define REDC_1_TO_REDC_N_THRESHOLD 117 + +#define MU_DIV_QR_THRESHOLD 2089 +#define MU_DIVAPPR_Q_THRESHOLD 2172 +#define MUPI_DIV_QR_THRESHOLD 225 +#define MU_BDIV_QR_THRESHOLD 1528 +#define MU_BDIV_Q_THRESHOLD 2089 + +#define MATRIX22_STRASSEN_THRESHOLD 16 +#define HGCD_THRESHOLD 197 +#define GCD_DC_THRESHOLD 902 +#define GCDEXT_DC_THRESHOLD 650 +#define JACOBI_BASE_METHOD 2 + +#define GET_STR_DC_THRESHOLD 20 +#define GET_STR_PRECOMPUTE_THRESHOLD 39 +#define SET_STR_DC_THRESHOLD 1045 +#define SET_STR_PRECOMPUTE_THRESHOLD 2147 diff --git a/gmp-6.3.0/mpn/arm/invert_limb.asm b/gmp-6.3.0/mpn/arm/invert_limb.asm new file mode 100644 index 0000000..af7502d --- /dev/null +++ b/gmp-6.3.0/mpn/arm/invert_limb.asm @@ -0,0 +1,93 @@ +dnl ARM mpn_invert_limb -- Invert a normalized limb. + +dnl Copyright 2001, 2009, 2011, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +ASM_START() +PROLOGUE(mpn_invert_limb) + LEA( r2, approx_tab-512) C table address biased by 512 bytes, see the offset range below + mov r3, r0, lsr #23 C r3 = top 9 bits of the argument + mov r3, r3, asl #1 C times 2: byte offset into a table of 16-bit entries + ldrh r3, [r3, r2] C initial approximation; for a normalized argument (top bit set) the offset is 512 to 1022 + mov r1, r3, asl #17 + mul r12, r3, r3 + umull r3, r2, r12, r0 C r2 = high word of r12 * r0 + sub r1, r1, r2, asl #1 + umull r3, r2, r1, r1 C r2:r3 = r1 * r1 + umull r12, r3, r0, r3 C r3 = high word of r0 * r3 + umull r2, r12, r0, r2 C r12:r2 = r0 * r2 + adds r2, r2, r3 + adc r12, r12, #0 + rsb r1, r12, r1 + mvn r2, r2, lsr #30 + add r2, r2, r1, asl #2 + umull r12, r3, r0, r2 C r3:r12 = r0 * r2 + adds r1, r12, r0 C only the carry out of this add is consumed, by the adc below + adc r3, r3, r0 + rsb r0, r3, r2 C return value = r2 - r3 + return lr +EPILOGUE() + + RODATA + ALIGN(2) +approx_tab: + .short 0xffc0,0xfec0,0xfdc0,0xfcc0,0xfbc0,0xfac0,0xfa00,0xf900 + .short 0xf800,0xf700,0xf640,0xf540,0xf440,0xf380,0xf280,0xf180 + .short 0xf0c0,0xefc0,0xef00,0xee00,0xed40,0xec40,0xeb80,0xeac0 + .short 0xe9c0,0xe900,0xe840,0xe740,0xe680,0xe5c0,0xe500,0xe400 + .short 0xe340,0xe280,0xe1c0,0xe100,0xe040,0xdf80,0xdec0,0xde00 + .short 0xdd40,0xdc80,0xdbc0,0xdb00,0xda40,0xd980,0xd8c0,0xd800 + .short 0xd740,0xd680,0xd600,0xd540,0xd480,0xd3c0,0xd340,0xd280 + .short 0xd1c0,0xd140,0xd080,0xcfc0,0xcf40,0xce80,0xcdc0,0xcd40 + .short 0xcc80,0xcc00,0xcb40,0xcac0,0xca00,0xc980,0xc8c0,0xc840 + .short 0xc780,0xc700,0xc640,0xc5c0,0xc540,0xc480,0xc400,0xc380 + .short 0xc2c0,0xc240,0xc1c0,0xc100,0xc080,0xc000,0xbf80,0xbec0 + .short
0xbe40,0xbdc0,0xbd40,0xbc80,0xbc00,0xbb80,0xbb00,0xba80 + .short 0xba00,0xb980,0xb900,0xb840,0xb7c0,0xb740,0xb6c0,0xb640 + .short 0xb5c0,0xb540,0xb4c0,0xb440,0xb3c0,0xb340,0xb2c0,0xb240 + .short 0xb1c0,0xb140,0xb0c0,0xb080,0xb000,0xaf80,0xaf00,0xae80 + .short 0xae00,0xad80,0xad40,0xacc0,0xac40,0xabc0,0xab40,0xaac0 + .short 0xaa80,0xaa00,0xa980,0xa900,0xa8c0,0xa840,0xa7c0,0xa740 + .short 0xa700,0xa680,0xa600,0xa5c0,0xa540,0xa4c0,0xa480,0xa400 + .short 0xa380,0xa340,0xa2c0,0xa240,0xa200,0xa180,0xa140,0xa0c0 + .short 0xa080,0xa000,0x9f80,0x9f40,0x9ec0,0x9e80,0x9e00,0x9dc0 + .short 0x9d40,0x9d00,0x9c80,0x9c40,0x9bc0,0x9b80,0x9b00,0x9ac0 + .short 0x9a40,0x9a00,0x9980,0x9940,0x98c0,0x9880,0x9840,0x97c0 + .short 0x9780,0x9700,0x96c0,0x9680,0x9600,0x95c0,0x9580,0x9500 + .short 0x94c0,0x9440,0x9400,0x93c0,0x9340,0x9300,0x92c0,0x9240 + .short 0x9200,0x91c0,0x9180,0x9100,0x90c0,0x9080,0x9000,0x8fc0 + .short 0x8f80,0x8f40,0x8ec0,0x8e80,0x8e40,0x8e00,0x8d80,0x8d40 + .short 0x8d00,0x8cc0,0x8c80,0x8c00,0x8bc0,0x8b80,0x8b40,0x8b00 + .short 0x8a80,0x8a40,0x8a00,0x89c0,0x8980,0x8940,0x88c0,0x8880 + .short 0x8840,0x8800,0x87c0,0x8780,0x8740,0x8700,0x8680,0x8640 + .short 0x8600,0x85c0,0x8580,0x8540,0x8500,0x84c0,0x8480,0x8440 + .short 0x8400,0x8380,0x8340,0x8300,0x82c0,0x8280,0x8240,0x8200 + .short 0x81c0,0x8180,0x8140,0x8100,0x80c0,0x8080,0x8040,0x8000 +ASM_END() diff --git a/gmp-6.3.0/mpn/arm/logops_n.asm b/gmp-6.3.0/mpn/arm/logops_n.asm new file mode 100644 index 0000000..7e04165 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/logops_n.asm @@ -0,0 +1,139 @@ +dnl ARM mpn_and_n, mpn_andn_n. mpn_nand_n, etc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb +C and andn ior xor nand iorn nior xnor +C StrongARM ? ? +C XScale ? ? +C Cortex-A7 ? ? +C Cortex-A8 ? ? +C Cortex-A9 2.5-2.72 2.75-3 +C Cortex-A15 2.25 2.75 + +C TODO +C * It seems that 2.25 c/l and 2.75 c/l is possible for A9. +C * Debug popping issue, see comment below. 
+ +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +define(`POSTOP') C empty by default; redefined below for the complementing variants + +ifdef(`OPERATION_and_n',` + define(`func', `mpn_and_n') + define(`LOGOP', `and $1, $2, $3')') +ifdef(`OPERATION_andn_n',` + define(`func', `mpn_andn_n') + define(`LOGOP', `bic $1, $2, $3')') +ifdef(`OPERATION_nand_n',` + define(`func', `mpn_nand_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `and $1, $2, $3')') +ifdef(`OPERATION_ior_n',` + define(`func', `mpn_ior_n') + define(`LOGOP', `orr $1, $2, $3')') +ifdef(`OPERATION_iorn_n',` + define(`func', `mpn_iorn_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `bic $1, $3, $2')') +ifdef(`OPERATION_nior_n',` + define(`func', `mpn_nior_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `orr $1, $2, $3')') +ifdef(`OPERATION_xor_n',` + define(`func', `mpn_xor_n') + define(`LOGOP', `eor $1, $2, $3')') +ifdef(`OPERATION_xnor_n',` + define(`func', `mpn_xnor_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `eor $1, $2, $3')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +ASM_START() +PROLOGUE(func) + push { r8, r9, r10 } C saved up front; restored at L(rtn) on every path + tst n, #1 C odd n: one limb + beq L(skip1) + ldr r10, [vp], #4 + ldr r12, [up], #4 + LOGOP( r12, r12, r10) + POSTOP( r12) + str r12, [rp], #4 +L(skip1): + tst n, #2 C bit 1 of n set: two limbs + beq L(skip2) + ldmia vp!, { r10, r12 } + ldmia up!, { r8, r9 } + LOGOP( r8, r8, r10) + LOGOP( r9, r9, r12) + POSTOP( r8) + POSTOP( r9) + stmia rp!, { r8, r9 } +L(skip2): + bics n, n, #3 C what remains is a multiple of 4 + beq L(rtn) + push { r4, r5, r6, r7 } + + ldmia vp!, { r8, r9, r10, r12 } + b L(mid) C enter the loop in the middle: no results are pending yet + +L(top): ldmia vp!, { r8, r9, r10, r12 } C next 4 limbs of vp + POSTOP( r4) + POSTOP( r5) + POSTOP( r6) + POSTOP( r7) + stmia rp!, { r4, r5, r6, r7 } C store the 4 results of the previous iteration +L(mid): sub n, n, #4 + ldmia up!, { r4, r5, r6, r7 } + teq n, #0 C sets Z for the bne below; the LOGOP forms defined above do not set flags + LOGOP( r4, r4, r8) + LOGOP( r5, r5, r9) + LOGOP( r6, r6, r10) + LOGOP( r7, r7, r12) + bne L(top) + + POSTOP( r4) + POSTOP( r5) + POSTOP( r6) + POSTOP( r7) + stmia rp!, { r4, r5, r6, r7 } C store the last 4 results + + pop { r4, r5, r6, r7
} C popping r8-r10 here strangely fails + +L(rtn): pop { r8, r9, r10 } + return r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/lshift.asm b/gmp-6.3.0/mpn/arm/lshift.asm new file mode 100644 index 0000000..1d5ce0a --- /dev/null +++ b/gmp-6.3.0/mpn/arm/lshift.asm @@ -0,0 +1,88 @@ +dnl ARM mpn_lshift. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 3.5 +C Cortex-A15 ? + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`cnt', `r3') +define(`tnc', `r12') + +ASM_START() +PROLOGUE(mpn_lshift) + add up, up, n, lsl #2 C up = one past the highest source limb + push {r4, r6, r7, r8} + ldr r4, [up, #-4]! C r4 = highest source limb + add rp, rp, n, lsl #2 C rp = one past the highest destination limb + rsb tnc, cnt, #32 C tnc = 32 - cnt + + mov r7, r4, lsl cnt C upper part of the highest result limb + tst n, #1 + beq L(evn) C n even + +L(odd): subs n, n, #2 + bcc L(1) C n = 1 + ldr r8, [up, #-4]!
+ b L(mid) + +L(evn): ldr r6, [up, #-4]! + subs n, n, #2 + beq L(end) + +L(top): ldr r8, [up, #-4]! + orr r7, r7, r6, lsr tnc C merge in the bits shifted out of the next lower limb + str r7, [rp, #-4]! + mov r7, r6, lsl cnt +L(mid): ldr r6, [up, #-4]! + orr r7, r7, r8, lsr tnc + str r7, [rp, #-4]! + mov r7, r8, lsl cnt + subs n, n, #2 C two limbs per iteration + bgt L(top) + +L(end): orr r7, r7, r6, lsr tnc + str r7, [rp, #-4]! + mov r7, r6, lsl cnt +L(1): str r7, [rp, #-4] C lowest result limb; its low cnt bits are zero + mov r0, r4, lsr tnc C return the bits shifted out of the highest limb + pop {r4, r6, r7, r8} + return r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/lshiftc.asm b/gmp-6.3.0/mpn/arm/lshiftc.asm new file mode 100644 index 0000000..e5b52df --- /dev/null +++ b/gmp-6.3.0/mpn/arm/lshiftc.asm @@ -0,0 +1,95 @@ +dnl ARM mpn_lshiftc. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 4.0 +C Cortex-A15 ?
+ +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`cnt', `r3') +define(`tnc', `r12') + +ASM_START() +PROLOGUE(mpn_lshiftc) + add up, up, n, lsl #2 C up = one past the highest source limb + push {r4, r6, r7, r8} + ldr r4, [up, #-4]! C r4 = highest source limb, kept uncomplemented for the return value + add rp, rp, n, lsl #2 C rp = one past the highest destination limb + rsb tnc, cnt, #32 C tnc = 32 - cnt + mvn r6, r4 C the shifted data is the complement of each source limb + + mov r7, r6, lsl cnt + tst n, #1 + beq L(evn) C n even + +L(odd): subs n, n, #2 + bcc L(1) C n = 1 + ldr r8, [up, #-4]! + mvn r8, r8 + b L(mid) + +L(evn): ldr r6, [up, #-4]! + mvn r6, r6 + subs n, n, #2 + beq L(end) + +L(top): ldr r8, [up, #-4]! + orr r7, r7, r6, lsr tnc C merge in the bits shifted out of the next lower limb + str r7, [rp, #-4]! + mvn r8, r8 + mov r7, r6, lsl cnt +L(mid): ldr r6, [up, #-4]! + orr r7, r7, r8, lsr tnc + str r7, [rp, #-4]! + mvn r6, r6 + mov r7, r8, lsl cnt + subs n, n, #2 C two limbs per iteration + bgt L(top) + +L(end): orr r7, r7, r6, lsr tnc + str r7, [rp, #-4]! + mov r7, r6, lsl cnt +L(1): mvn r6, #0 C all ones + orr r7, r7, r6, lsr tnc C set the low cnt bits of the lowest result limb + str r7, [rp, #-4] + mov r0, r4, lsr tnc C return the bits shifted out of the highest source limb + pop {r4, r6, r7, r8} + return r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/mod_34lsub1.asm b/gmp-6.3.0/mpn/arm/mod_34lsub1.asm new file mode 100644 index 0000000..596cd3c --- /dev/null +++ b/gmp-6.3.0/mpn/arm/mod_34lsub1.asm @@ -0,0 +1,124 @@ +dnl ARM mpn_mod_34lsub1 -- remainder modulo 2^24-1. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here.
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A5 2.67 +C Cortex-A7 2.35 +C Cortex-A8 2.0 +C Cortex-A9 1.33 +C Cortex-A15 1.33 +C Cortex-A17 3.34 +C Cortex-A53 2.0 + +define(`ap', r0) +define(`n', r1) + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n) + +C TODO +C * Write cleverer summation code. +C * Consider loading 6 64-bit aligned registers at a time, to approach 1 c/l. + +ASM_START() + TEXT + ALIGN(32) +PROLOGUE(mpn_mod_34lsub1) + push { r4, r5, r6, r7 } + + subs n, n, #3 + mov r7, #0 C r7 counts carries out of the three-limb accumulator + blt L(le2) C n <= 2 + + ldmia ap!, { r2, r3, r12 } C accumulators start as the first three limbs + subs n, n, #3 + blt L(sum) C n <= 5 + cmn r0, #0 C clear carry + sub n, n, #3 + b L(mid) + +L(top): adcs r2, r2, r4 C one carry chain runs through all three accumulators and around the loop + adcs r3, r3, r5 + adcs r12, r12, r6 +L(mid): ldmia ap!, { r4, r5, r6 } + tst n, n C sets N from n without touching the carry + sub n, n, #3 + bpl L(top) + + add n, n, #3 C undo the last decrement; n is now -3, -2 or -1 + + adcs r2, r2, r4 + adcs r3, r3, r5 + adcs r12, r12, r6 + movcs r7, #1 C r7 <= 1 + +L(sum): cmn n, #2 C n is -3, -2 or -1: 0, 1 or 2 limbs remain + movlo r4, #0 + ldrhs r4, [ap], #4 + movls r5, #0 + ldrhi r5, [ap], #4 + + adds r2, r2, r4 + adcs r3, r3, r5 + adcs r12, r12, #0 + adc r7, r7, #0 C r7 <= 2 + +L(sum2): + bic r0, r2, #0xff000000 C low 24 bits of r2 + add r0, r0, r2, lsr #24 C plus its top 8 bits + add r0, r0, r7 C plus the carry count + + mov r7, r3, lsl #8 C r3 is folded in shifted left by 8 + bic r1, r7, #0xff000000 C low 16 bits of r3, moved up by 8 + add r0, r0, r1 + add r0, r0, r3, lsr #16 C top 16 bits of r3 + + mov r7, r12, lsl #16 C r12 is folded in shifted left by 16 + bic r1, r7, #0xff000000 C low 8 bits of r12, moved up by 16 + add r0, r0, r1 + add r0, r0, r12, lsr #8 C top 24 bits of r12; the sum is returned without further reduction + + pop { r4, r5, r6, r7 } + return lr + +L(le2): cmn n, #1 C Z set when the original n was 2 + bne L(1) + ldmia ap!, { r2, r3 } + mov r12, #0 + b L(sum2) +L(1): ldr r2, [ap] + bic r0, r2, #0xff000000 +
add r0, r0, r2, lsr #24 + pop { r4, r5, r6, r7 } + return lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/mode1o.asm b/gmp-6.3.0/mpn/arm/mode1o.asm new file mode 100644 index 0000000..63a7f36 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/mode1o.asm @@ -0,0 +1,92 @@ +dnl ARM mpn_modexact_1c_odd + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ?
+C Cortex-A9	10
+C Cortex-A15	 9
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	-
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`up', `r0')
+define(`n',  `r1')
+define(`d',  `r2')
+define(`cy', `r3')
+
+	.protected	binvert_limb_table
+ASM_START()
+PROLOGUE(mpn_modexact_1c_odd)
+	stmfd	sp!, {r4, r5}
+
+	LEA(	r4, binvert_limb_table)
+
+	ldr	r5, [up], #4		C up[0]
+
+	and	r12, d, #254		C table index is (d & 254) >> 1
+	ldrb	r4, [r4, r12, lsr #1]	C starting inverse byte (presumably 8 bits good)
+	mul	r12, r4, r4
+	mul	r12, d, r12
+	rsb	r12, r12, r4, asl #1	C r12 = 2*r4 - d*r4^2, a Newton step
+	mul	r4, r12, r12
+	mul	r4, d, r4
+	rsb	r4, r4, r12, asl #1	C r4 = inverse
+
+	subs	n, n, #1		C set carry as side-effect
+	beq	L(end)
+
+L(top):	sbcs	cy, r5, cy		C limb minus previous high product, with borrow
+	ldr	r5, [up], #4
+	sub	n, n, #1
+	mul	r12, r4, cy		C quotient limb = inverse * difference
+	tst	n, n			C sets Z only; the borrow in C survives for sbcs
+	umull	r12, cy, d, r12		C cy = high half of d * quotient limb
+	bne	L(top)
+
+L(end):	sbcs	cy, r5, cy
+	mul	r12, r4, cy
+	umull	r12, r0, d, r12		C return value = final high half
+	addcc	r0, r0, #1		C plus one if the last sbcs borrowed
+
+	ldmfd	sp!, {r4, r5}
+	return	r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/mul_1.asm b/gmp-6.3.0/mpn/arm/mul_1.asm
new file mode 100644
index 0000000..f7bc1bc
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/mul_1.asm
@@ -0,0 +1,94 @@
+dnl  ARM mpn_mul_1 -- Multiply a limb vector with a limb and store the result
+dnl  in a second limb vector.
+dnl  Contributed by Robert Harley.
+
+dnl  Copyright 1998, 2000, 2001, 2003, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:     6-8
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 4.75
+C Cortex-A15	 ?
+
+C We should rewrite this along the lines of addmul_1.asm.  That should save a
+C cycle on StrongARM, and several cycles on XScale.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n',`r2')
+define(`vl',`r3')
+
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+	stmfd	sp!, { r8, r9, lr }
+	ands	r12, n, #1		C r12 = 0 for even n, so it doubles as the initial carry limb
+	beq	L(skip1)
+	ldr	lr, [up], #4		C odd n: do one limb first
+	umull	r9, r12, lr, vl		C r12 = high half, carried into the next limb
+	str	r9, [rp], #4
+L(skip1):
+	tst	n, #2
+	beq	L(skip2)
+	mov	r8, r12			C carry in becomes the low accumulator
+	ldmia	up!, { r12, lr }	C two limbs
+	mov	r9, #0
+	umlal	r8, r9, r12, vl
+	mov	r12, #0
+	umlal	r9, r12, lr, vl		C r12 = carry out again
+	stmia	rp!, { r8, r9 }
+L(skip2):
+	bics	n, n, #3		C remaining count, a multiple of 4
+	beq	L(rtn)
+	stmfd	sp!, { r6, r7 }
+
+L(top):	mov	r6, r12			C carry in
+	ldmia	up!, { r8, r9, r12, lr }	C four limbs per pass
+	ldr	r7, [rp, #12]			C cache allocate
+	mov	r7, #0
+	umlal	r6, r7, r8, vl
+	mov	r8, #0
+	umlal	r7, r8, r9, vl
+	mov	r9, #0
+	umlal	r8, r9, r12, vl
+	mov	r12, #0
+	umlal	r9, r12, lr, vl		C r12 = carry out
+	subs	n, n, #4
+	stmia	rp!, { r6, r7, r8, r9 }
+	bne	L(top)
+
+	ldmfd	sp!, { r6, r7 }
+
+L(rtn):	mov	r0, r12			C return the final carry limb
+	ldmfd	sp!, { r8, r9, pc }
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/neon/README b/gmp-6.3.0/mpn/arm/neon/README
new file mode 100644
index 0000000..79e3b48
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/neon/README
@@ -0,0 +1,2 @@
+This directory contains Neon code which runs and is efficient on all
+ARM CPUs which support Neon.
diff --git a/gmp-6.3.0/mpn/arm/neon/hamdist.asm b/gmp-6.3.0/mpn/arm/neon/hamdist.asm new file mode 100644 index 0000000..2320896 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/neon/hamdist.asm @@ -0,0 +1,194 @@ +dnl ARM Neon mpn_hamdist -- mpn bit hamming distance. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.89 +C Cortex-A15 0.95 + +C TODO +C * Explore using vldr and vldm. Does it help on A9? (These loads do +C 64-bits-at-a-time, which will mess up in big-endian mode. Except not for +C popcount. Except perhaps also for popcount for the edge loads.) +C * Arrange to align the pointer, if that helps performance. Use the same +C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry +C valgrind!) +C * Explore if explicit align directives, e.g., "[ptr:128]" help. 
+C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
+
+C INPUT PARAMETERS
+define(`ap', r0)
+define(`bp', r1)
+define(`n', r2)
+
+C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end
+C up with 8 16-bit counters.  Therefore, we can sum to 8(2^16-1) bits, or
+C (8*2^16-1)/32 = 0x3fff limbs.  We use a chunksize close to that, but which
+C can be represented as an 8-bit ARM constant.
+C
+define(`chunksize',0x3f80)
+
+ASM_START()
+PROLOGUE(mpn_hamdist)
+
+	cmp	n, #chunksize
+	bhi	L(gt16k)
+
+L(lt16k):
+	vmov.i64 q8, #0			C clear summation register
+	vmov.i64 q9, #0			C clear summation register
+
+	tst	n, #1
+	beq	L(xxx0)
+	vmov.i64 d0, #0
+	vmov.i64 d20, #0
+	sub	n, n, #1
+	vld1.32	{d0[0]}, [ap]!		C load 1 limb
+	vld1.32	{d20[0]}, [bp]!		C load 1 limb
+	veor	d0, d0, d20		C bits that differ between the operands
+	vcnt.8	d24, d0			C per-byte bit counts
+	vpadal.u8 d16, d24		C d16/q8 = 0; could just splat
+
+L(xxx0):tst	n, #2
+	beq	L(xx00)
+	sub	n, n, #2
+	vld1.32	{d0}, [ap]!		C load 2 limbs
+	vld1.32	{d20}, [bp]!		C load 2 limbs
+	veor	d0, d0, d20
+	vcnt.8	d24, d0
+	vpadal.u8 d16, d24		C add byte pairs into the 16-bit counters
+
+L(xx00):tst	n, #4
+	beq	L(x000)
+	sub	n, n, #4
+	vld1.32	{q0}, [ap]!		C load 4 limbs
+	vld1.32	{q10}, [bp]!		C load 4 limbs
+	veor	q0, q0, q10
+	vcnt.8	q12, q0
+	vpadal.u8 q8, q12
+
+L(x000):tst	n, #8
+	beq	L(0000)
+
+	subs	n, n, #8
+	vld1.32	{q0,q1}, [ap]!		C load 8 limbs
+	vld1.32	{q10,q11}, [bp]!	C load 8 limbs
+	bls	L(sum)			C nothing beyond these 8 limbs
+
+L(gt8):	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
+	vld1.32	{q14,q15}, [bp]!	C load 8 limbs
+	veor	q0, q0, q10
+	veor	q1, q1, q11
+	sub	n, n, #8
+	vcnt.8	q12, q0
+	vcnt.8	q13, q1
+	b	L(mid)
+
+L(0000):subs	n, n, #16
+	blo	L(e0)			C no 16-limb group left
+
+	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
+	vld1.32	{q0,q1}, [ap]!		C load 8 limbs
+	vld1.32	{q14,q15}, [bp]!	C load 8 limbs
+	vld1.32	{q10,q11}, [bp]!	C load 8 limbs
+	veor	q2, q2, q14
+	veor	q3, q3, q15
+	vcnt.8	q12, q2
+	vcnt.8	q13, q3
+	subs	n, n, #16
+	blo	L(end)
+
+L(top):	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
+	vld1.32	{q14,q15}, [bp]!	C load 8 limbs
+	veor	q0, q0, q10
+	veor	q1, q1, q11
+	vpadal.u8 q8, q12
+	vcnt.8	q12, q0
+	vpadal.u8 q9, q13
+	vcnt.8	q13, q1
+L(mid):	vld1.32	{q0,q1}, [ap]!		C load 8 limbs
+	vld1.32	{q10,q11}, [bp]!	C load 8 limbs
+	veor	q2, q2, q14
+	veor	q3, q3, q15
+	subs	n, n, #16
+	vpadal.u8 q8, q12
+	vcnt.8	q12, q2
+	vpadal.u8 q9, q13
+	vcnt.8	q13, q3
+	bhs	L(top)
+
+L(end):	vpadal.u8 q8, q12
+	vpadal.u8 q9, q13
+L(sum):	veor	q0, q0, q10		C last 8 limbs, already loaded
+	veor	q1, q1, q11
+	vcnt.8	q12, q0
+	vcnt.8	q13, q1
+	vpadal.u8 q8, q12
+	vpadal.u8 q9, q13
+	vadd.i16 q8, q8, q9
+					C we have 8 16-bit counts
+L(e0):	vpaddl.u16 q8, q8		C we have 4 32-bit counts
+	vpaddl.u32 q8, q8		C we have 2 64-bit counts
+	vmov.32	r0, d16[0]
+	vmov.32	r1, d17[0]
+	add	r0, r0, r1		C low words of the two counts summed
+	bx	lr
+
+C Code for large count.  Splits operand and calls above code.
+define(`ap2', r5)
+define(`bp2', r6)
+L(gt16k):
+	push	{r4,r5,r6,r14}
+	mov	ap2, ap
+	mov	bp2, bp
+	mov	r3, n			C full count
+	mov	r4, #0			C total sum
+
+1:	mov	n, #chunksize		C count for this invocation
+	bl	L(lt16k)		C could jump deep inside code
+	add	ap2, ap2, #chunksize*4	C point at next chunk
+	add	bp2, bp2, #chunksize*4	C point at next chunk
+	add	r4, r4, r0
+	mov	ap, ap2			C put chunk pointer in place for call
+	mov	bp, bp2			C put chunk pointer in place for call
+	sub	r3, r3, #chunksize
+	cmp	r3, #chunksize
+	bhi	1b
+
+	mov	n, r3			C count for final invocation
+	bl	L(lt16k)
+	add	r0, r4, r0
+	pop	{r4,r5,r6,pc}
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/neon/lorrshift.asm b/gmp-6.3.0/mpn/arm/neon/lorrshift.asm
new file mode 100644
index 0000000..7ebc780
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/neon/lorrshift.asm
@@ -0,0 +1,279 @@
+dnl  ARM Neon mpn_lshift and mpn_rshift.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C StrongARM - - +C XScale - - +C Cortex-A7 ? ? +C Cortex-A8 ? ? +C Cortex-A9 3 3 Y +C Cortex-A15 1.5 1.5 Y + + +C We read 64 bits at a time at 32-bit aligned addresses, and except for the +C first and last store, we write using 64-bit aligned addresses. All shifting +C is done on 64-bit words in 'extension' registers. +C +C It should be possible to read also using 64-bit alignment, by manipulating +C the shift count for unaligned operands. Not done, since it does not seem to +C matter for A9 or A15. +C +C This will not work in big-endian mode. + +C TODO +C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts, +C which might make it tricky. +C * Clean up and simplify. +C * Consider sharing most of the code for lshift and rshift, since the feed-in +C code, the loop, and most of the wind-down code are identical. 
+C  * Replace the basecase code with code using 'extension' registers.
+C  * Optimise.  It is not clear that this loop insn permutation is optimal for
+C    either A9 or A15.
+
+C INPUT PARAMETERS
+define(`rp',  `r0')
+define(`ap',  `r1')
+define(`n',   `r2')
+define(`cnt', `r3')
+
+ifdef(`OPERATION_lshift',`
+	define(`IFLSH', `$1')
+	define(`IFRSH', `')
+	define(`X',`0')
+	define(`Y',`1')
+	define(`func',`mpn_lshift')
+')
+ifdef(`OPERATION_rshift',`
+	define(`IFLSH', `')
+	define(`IFRSH', `$1')
+	define(`X',`1')
+	define(`Y',`0')
+	define(`func',`mpn_rshift')
+')
+
+MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
+
+ASM_START(neon)
+	TEXT
+	ALIGN(64)
+PROLOGUE(func)
+IFLSH(`	mov	r12, n, lsl #2	')
+IFLSH(`	add	rp, rp, r12	')
+IFLSH(`	add	ap, ap, r12	')
+
+	cmp	n, #4			C SIMD code n limit
+	ble	L(base)
+
+ifdef(`OPERATION_lshift',`
+	vdup.32	 d6, r3			C left shift count is positive
+	sub	r3, r3, #64		C right shift count is negative
+	vdup.32	 d7, r3
+	mov	r12, #-8')		C lshift pointer update offset
+ifdef(`OPERATION_rshift',`
+	rsb	r3, r3, #0		C right shift count is negative
+	vdup.32	 d6, r3
+	add	r3, r3, #64		C left shift count is positive
+	vdup.32	 d7, r3
+	mov	r12, #8')		C rshift pointer update offset
+
+IFLSH(`	sub	ap, ap, #8	')
+	vld1.32	{d19}, [ap], r12	C load initial 2 limbs
+	vshl.u64 d18, d19, d7		C retval
+
+	tst	rp, #4			C is rp 64-bit aligned already?
+	beq	L(rp_aligned)		C yes, skip
+IFLSH(`	add	ap, ap, #4	')	C move back ap pointer
+IFRSH(`	sub	ap, ap, #4	')	C move back ap pointer
+	vshl.u64 d4, d19, d6
+	sub	n, n, #1		C first limb handled
+IFLSH(`	sub	rp, rp, #4	')
+	vst1.32	 {d4[Y]}, [rp]IFRSH(!)	C store first limb, rp gets aligned
+	vld1.32	{d19}, [ap], r12	C load ap[1] and ap[2]
+
+L(rp_aligned):
+IFLSH(`	sub	rp, rp, #8	')
+	subs	n, n, #6
+	blt	L(two_or_three_more)
+	tst	n, #2
+	beq	L(2)
+
+L(1):	vld1.32	 {d17}, [ap], r12
+	vshl.u64 d5, d19, d6
+	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	sub	n, n, #2
+	b	 L(mid)
+
+L(2):	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vld1.32	 {d17}, [ap], r12
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	subs	n, n, #4
+	blt	L(end)
+
+L(top):	vld1.32	 {d16}, [ap], r12
+	vorr	 d2, d4, d1		C merge the d6-shifted and d7-shifted parts
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vst1.32	 {d2}, [rp:64], r12
+L(mid):	vld1.32	 {d17}, [ap], r12
+	vorr	 d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	 {d3}, [rp:64], r12
+	subs	n, n, #4
+	bge	L(top)
+
+L(end):	tst	n, #1
+	beq	L(evn)
+
+	vorr	 d2, d4, d1
+	vst1.32	 {d2}, [rp:64], r12
+	b	 L(cj1)
+
+L(evn):	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vshl.u64 d16, d17, d6
+	vst1.32	 {d2}, [rp:64], r12
+	vorr	 d2, d5, d0
+	b	 L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+	tst	n, #1
+	beq	L(l2)
+
+L(l3):	vshl.u64 d5, d19, d6
+	vld1.32	 {d17}, [ap], r12
+L(cj1):	veor	 d16, d16, d16		C zero d16 before loading one lane
+IFLSH(`	add	 ap, ap, #4	')
+	vld1.32	 {d16[Y]}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vorr	 d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	 {d3}, [rp:64], r12
+	vorr	 d2, d4, d1
+	vst1.32	 {d2}, [rp:64], r12
+IFLSH(`	add	 rp, rp, #4	')
+	vst1.32	 {d5[Y]}, [rp]
+	vmov.32	 r0, d18[X]		C return the lane of d18 saved as retval
+	bx	lr
+
+L(l2):	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vshl.u64 d1, d16, d7
+	vshl.u64 d16, d16, d6
+	vorr	 d2, d4, d1
+L(cj2):	vst1.32	 {d2}, [rp:64], r12
+	vst1.32	 {d16}, [rp]
+	vmov.32	 r0, d18[X]
+	bx	lr
+
+
+define(`tnc', `r12')
+L(base):
+	push	{r4, r6, r7, r8}
+ifdef(`OPERATION_lshift',`
+	ldr	r4, [ap, #-4]!
+	rsb	tnc, cnt, #32
+
+	mov	r7, r4, lsl cnt
+	tst	n, #1
+	beq	L(ev)			C n even
+
+L(od):	subs	n, n, #2
+	bcc	L(ed1)			C n = 1
+	ldr	r8, [ap, #-4]!
+	b	L(md)			C n = 3
+
+L(ev):	ldr	r6, [ap, #-4]!
+	subs	n, n, #2
+	beq	L(ed)			C n = 2
+					C n = 4
+L(tp):	ldr	r8, [ap, #-4]!
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(md):	ldr	r6, [ap, #-4]!
+	orr	r7, r7, r8, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r8, lsl cnt
+
+L(ed):	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(ed1):	str	r7, [rp, #-4]
+	mov	r0, r4, lsr tnc
+')
+ifdef(`OPERATION_rshift',`
+	ldr	r4, [ap]
+	rsb	tnc, cnt, #32
+
+	mov	r7, r4, lsr cnt
+	tst	n, #1
+	beq	L(ev)			C n even
+
+L(od):	subs	n, n, #2
+	bcc	L(ed1)			C n = 1
+	ldr	r8, [ap, #4]!
+	b	L(md)			C n = 3
+
+L(ev):	ldr	r6, [ap, #4]!
+	subs	n, n, #2
+	beq	L(ed)			C n = 2
+					C n = 4
+
+L(tp):	ldr	r8, [ap, #4]!
+	orr	r7, r7, r6, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt
+L(md):	ldr	r6, [ap, #4]!
+	orr	r7, r7, r8, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r8, lsr cnt
+
+L(ed):	orr	r7, r7, r6, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt
+L(ed1):	str	r7, [rp], #4
+	mov	r0, r4, lsl tnc
+')
+	pop	{r4, r6, r7, r8}
+	bx	r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/neon/lshiftc.asm b/gmp-6.3.0/mpn/arm/neon/lshiftc.asm
new file mode 100644
index 0000000..f1bf0de
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/neon/lshiftc.asm
@@ -0,0 +1,242 @@
+dnl  ARM Neon mpn_lshiftc.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb cycles/limb good +C aligned unaligned best seen for cpu? +C StrongARM - - +C XScale - - +C Cortex-A7 ? ? +C Cortex-A8 ? ? +C Cortex-A9 3.5 3.5 Y +C Cortex-A15 1.75 1.75 Y + + +C We read 64 bits at a time at 32-bit aligned addresses, and except for the +C first and last store, we write using 64-bit aligned addresses. All shifting +C is done on 64-bit words in 'extension' registers. +C +C It should be possible to read also using 64-bit alignment, by manipulating +C the shift count for unaligned operands. Not done, since it does not seem to +C matter for A9 or A15. +C +C This will not work in big-endian mode. + +C TODO +C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts, +C which might make it tricky. +C * Clean up and simplify. +C * Consider sharing most of the code for lshift and rshift, since the feed-in +C code, the loop, and most of the wind-down code are identical. +C * Replace the basecase code with code using 'extension' registers. +C * Optimise. It is not clear that this loop insn permutation is optimal for +C either A9 or A15. 
+
+C INPUT PARAMETERS
+define(`rp',  `r0')
+define(`ap',  `r1')
+define(`n',   `r2')
+define(`cnt', `r3')
+
+ASM_START(neon)
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_lshiftc)
+	mov	r12, n, lsl #2
+	add	rp, rp, r12		C point just past the top limb; work downwards
+	add	ap, ap, r12
+
+	cmp	n, #4			C SIMD code n limit
+	ble	L(base)
+
+	vdup.32	 d6, r3			C left shift count is positive
+	sub	r3, r3, #64		C right shift count is negative
+	vdup.32	 d7, r3
+	mov	r12, #-8		C lshift pointer update offset
+
+	sub	ap, ap, #8
+	vld1.32	{d19}, [ap], r12	C load initial 2 limbs
+	vshl.u64 d18, d19, d7		C retval
+
+	tst	rp, #4			C is rp 64-bit aligned already?
+	beq	L(rp_aligned)		C yes, skip
+	vmvn	 d19, d19
+	add	ap, ap, #4		C move back ap pointer
+	vshl.u64 d4, d19, d6
+	sub	n, n, #1		C first limb handled
+	sub	rp, rp, #4
+	vst1.32	 {d4[1]}, [rp]		C store first limb, rp gets aligned
+	vld1.32	{d19}, [ap], r12	C load ap[1] and ap[2]
+
+L(rp_aligned):
+	sub	rp, rp, #8
+	subs	n, n, #6
+	vmvn	 d19, d19		C complement the source limbs before shifting
+	blt	L(two_or_three_more)
+	tst	n, #2
+	beq	L(2)
+
+L(1):	vld1.32	 {d17}, [ap], r12
+	vshl.u64 d5, d19, d6
+	vmvn	 d17, d17
+	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	sub	n, n, #2
+	b	 L(mid)
+
+L(2):	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vmvn	 d16, d16
+	vld1.32	 {d17}, [ap], r12
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	subs	n, n, #4
+	blt	L(end)
+
+L(top):	vmvn	 d17, d17
+	vld1.32	 {d16}, [ap], r12
+	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vst1.32	 {d2}, [rp:64], r12
+L(mid):	vmvn	 d16, d16
+	vld1.32	 {d17}, [ap], r12
+	vorr	 d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	 {d3}, [rp:64], r12
+	subs	n, n, #4
+	bge	L(top)
+
+L(end):	tst	n, #1
+	beq	L(evn)
+
+	vorr	 d2, d4, d1
+	vst1.32	 {d2}, [rp:64], r12
+	b	 L(cj1)
+
+L(evn):	vmvn	 d17, d17
+	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vst1.32	 {d2}, [rp:64], r12
+	vmov.u8	 d17, #255		C all ones
+	vorr	 d2, d5, d0
+	vshl.u64 d0, d17, d7		C ones shifted down: fills the vacated low bits
+	vorr	 d3, d4, d0
+	b	 L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+	tst	n, #1
+	beq	L(l2)
+
+L(l3):	vshl.u64 d5, d19, d6
+	vld1.32	 {d17}, [ap], r12
+L(cj1):	vmov.u8	 d16, #0
+	add	 ap, ap, #4
+	vmvn	 d17, d17
+	vld1.32	 {d16[1]}, [ap], r12
+	vshl.u64 d0, d17, d7
+	vshl.u64 d4, d17, d6
+	vmvn	 d16, d16		C the zeroed lane becomes all ones here
+	vorr	 d3, d5, d0
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vst1.32	 {d3}, [rp:64], r12
+	vorr	 d2, d4, d1
+	vst1.32	 {d2}, [rp:64], r12
+	add	 rp, rp, #4
+	vst1.32	 {d5[1]}, [rp]
+	vmov.32	 r0, d18[0]		C retval comes from the uncomplemented load
+	bx	lr
+
+L(l2):	vld1.32	 {d16}, [ap], r12
+	vshl.u64 d4, d19, d6
+	vmvn	 d16, d16
+	vshl.u64 d1, d16, d7
+	vshl.u64 d5, d16, d6
+	vmov.u8	 d17, #255		C all ones
+	vorr	 d2, d4, d1
+	vshl.u64 d0, d17, d7		C ones shifted down: fills the vacated low bits
+	vorr	 d3, d5, d0
+L(cj2):	vst1.32	 {d2}, [rp:64], r12
+	vst1.32	 {d3}, [rp]
+	vmov.32	 r0, d18[0]
+	bx	lr
+
+
+define(`tnc', `r12')
+L(base):
+	push	{r4, r6, r7, r8}
+	ldr	r4, [ap, #-4]!
+	rsb	tnc, cnt, #32
+	mvn	r6, r4			C r4 keeps the plain top limb for the return value
+
+	mov	r7, r6, lsl cnt
+	tst	n, #1
+	beq	L(ev)			C n even
+
+L(od):	subs	n, n, #2
+	bcc	L(ed1)			C n = 1
+	ldr	r8, [ap, #-4]!
+	mvn	r8, r8
+	b	L(md)			C n = 3
+
+L(ev):	ldr	r6, [ap, #-4]!
+	mvn	r6, r6
+	subs	n, n, #2
+	beq	L(ed)			C n = 2
+					C n = 4
+L(tp):	ldr	r8, [ap, #-4]!
+	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mvn	r8, r8
+	mov	r7, r6, lsl cnt
+L(md):	ldr	r6, [ap, #-4]!
+	orr	r7, r7, r8, lsr tnc
+	str	r7, [rp, #-4]!
+	mvn	r6, r6
+	mov	r7, r8, lsl cnt
+
+L(ed):	orr	r7, r7, r6, lsr tnc
+	str	r7, [rp, #-4]!
+	mov	r7, r6, lsl cnt
+L(ed1):	mvn	r6, #0			C all ones
+	orr	r7, r7, r6, lsr tnc	C set the low cnt bits of the lowest limb
+	str	r7, [rp, #-4]
+	mov	r0, r4, lsr tnc		C bits shifted out of the uncomplemented top limb
+	pop	{r4, r6, r7, r8}
+	bx	r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/neon/popcount.asm b/gmp-6.3.0/mpn/arm/neon/popcount.asm
new file mode 100644
index 0000000..2f8f9af
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/neon/popcount.asm
@@ -0,0 +1,166 @@
+dnl  ARM Neon mpn_popcount -- mpn bit population count.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.125 +C Cortex-A15 0.56 + +C TODO +C * Explore using vldr and vldm. Does it help on A9? (These loads do +C 64-bits-at-a-time, which will mess up in big-endian mode. Except not for +C popcount. Except perhaps also for popcount for the edge loads.) +C * Arrange to align the pointer, if that helps performance. Use the same +C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry +C valgrind!) +C * Explore if explicit align directives, e.g., "[ptr:128]" help. +C * See rth's gmp-devel 2013-02/03 messages about final summation tricks. + +C INPUT PARAMETERS +define(`ap', r0) +define(`n', r1) + +C We sum into 16 16-bit counters in q8,q9, but at the end we sum them and end +C up with 8 16-bit counters. Therefore, we can sum to 8(2^16-1) bits, or +C (8*2^16-1)/32 = 0x3fff limbs. We use a chunksize close to that, but which +C can be represented as a 8-bit ARM constant. 
+C
+define(`chunksize',0x3f80)
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+
+	cmp	n, #chunksize
+	bhi	L(gt16k)
+
+L(lt16k):
+	vmov.i64 q8, #0			C clear summation register
+	vmov.i64 q9, #0			C clear summation register
+
+	tst	n, #1
+	beq	L(xxx0)
+	vmov.i64 d0, #0
+	sub	n, n, #1
+	vld1.32	{d0[0]}, [ap]!		C load 1 limb
+	vcnt.8	d24, d0			C per-byte bit counts
+	vpadal.u8 d16, d24		C d16/q8 = 0; could just splat
+
+L(xxx0):tst	n, #2
+	beq	L(xx00)
+	sub	n, n, #2
+	vld1.32	{d0}, [ap]!		C load 2 limbs
+	vcnt.8	d24, d0
+	vpadal.u8 d16, d24		C add byte pairs into the 16-bit counters
+
+L(xx00):tst	n, #4
+	beq	L(x000)
+	sub	n, n, #4
+	vld1.32	{q0}, [ap]!		C load 4 limbs
+	vcnt.8	q12, q0
+	vpadal.u8 q8, q12
+
+L(x000):tst	n, #8
+	beq	L(0000)
+
+	subs	n, n, #8
+	vld1.32	{q0,q1}, [ap]!		C load 8 limbs
+	bls	L(sum)			C nothing beyond these 8 limbs
+
+L(gt8):	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
+	sub	n, n, #8
+	vcnt.8	q12, q0
+	vcnt.8	q13, q1
+	b	L(mid)
+
+L(0000):subs	n, n, #16
+	blo	L(e0)			C no 16-limb group left
+
+	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
+	vld1.32	{q0,q1}, [ap]!		C load 8 limbs
+	vcnt.8	q12, q2
+	vcnt.8	q13, q3
+	subs	n, n, #16
+	blo	L(end)
+
+L(top):	vld1.32	{q2,q3}, [ap]!		C load 8 limbs
+	vpadal.u8 q8, q12
+	vcnt.8	q12, q0
+	vpadal.u8 q9, q13
+	vcnt.8	q13, q1
+L(mid):	vld1.32	{q0,q1}, [ap]!		C load 8 limbs
+	subs	n, n, #16
+	vpadal.u8 q8, q12
+	vcnt.8	q12, q2
+	vpadal.u8 q9, q13
+	vcnt.8	q13, q3
+	bhs	L(top)
+
+L(end):	vpadal.u8 q8, q12
+	vpadal.u8 q9, q13
+L(sum):	vcnt.8	q12, q0			C last 8 limbs, already loaded
+	vcnt.8	q13, q1
+	vpadal.u8 q8, q12
+	vpadal.u8 q9, q13
+	vadd.i16 q8, q8, q9
+					C we have 8 16-bit counts
+L(e0):	vpaddl.u16 q8, q8		C we have 4 32-bit counts
+	vpaddl.u32 q8, q8		C we have 2 64-bit counts
+	vmov.32	r0, d16[0]
+	vmov.32	r1, d17[0]
+	add	r0, r0, r1		C low words of the two counts summed
+	bx	lr
+
+C Code for large count.  Splits operand and calls above code.
+define(`ap2', r2)			C caller-saves reg not used above
+L(gt16k):
+	push	{r4,r14}
+	mov	ap2, ap
+	mov	r3, n			C full count
+	mov	r4, #0			C total sum
+
+1:	mov	n, #chunksize		C count for this invocation
+	bl	L(lt16k)		C could jump deep inside code
+	add	ap2, ap2, #chunksize*4	C point at next chunk
+	add	r4, r4, r0
+	mov	ap, ap2			C put chunk pointer in place for call
+	sub	r3, r3, #chunksize
+	cmp	r3, #chunksize
+	bhi	1b
+
+	mov	n, r3			C count for final invocation
+	bl	L(lt16k)
+	add	r0, r4, r0
+	pop	{r4,pc}
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm b/gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm
new file mode 100644
index 0000000..69fceb0
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/neon/sec_tabselect.asm
@@ -0,0 +1,140 @@
+dnl  ARM Neon mpn_sec_tabselect.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2011-2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 -
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 1.15
+C Cortex-A15	 0.65
+
+define(`rp',     `r0')
+define(`tp',     `r1')
+define(`n',      `r2')
+define(`nents',  `r3')
+C define(`which',  on stack)
+
+define(`i',      `r4')
+define(`j',      `r5')
+
+define(`maskq',  `q10')
+define(`maskd',  `d20')
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+	push	{r4-r5}
+
+	add	  r4, sp, #8		C stack argument sits above the two pushed registers
+	vld1.32	  {d30[], d31[]}, [r4]	C 4 `which' copies
+	vmov.i32  q14, #1		C 4 copies of 1
+
+	subs	j, n, #8
+	bmi	L(outer_end)
+
+L(outer_top):
+	mov	  i, nents
+	mov	  r12, tp		C preserve tp
+	veor	  q13, q13, q13		C 4 counter copies
+	veor	  q2, q2, q2
+	veor	  q3, q3, q3
+	ALIGN(16)
+L(top):	vceq.i32  maskq, q13, q15	C compare idx copies to `which' copies
+	vld1.32	  {q0,q1}, [tp]		C every entry is loaded; selection is by mask only
+	vadd.i32 q13, q13, q14
+	vbit	  q2, q0, maskq		C keep the loaded bits where the mask is set
+	vbit	  q3, q1, maskq
+	add	  tp, tp, n, lsl #2	C same slice of the next entry, n limbs further on
+	subs	  i, i, #1
+	bne	  L(top)
+	vst1.32	  {q2,q3}, [rp]!
+	add	  tp, r12, #32		C restore tp, point to next slice
+	subs	  j, j, #8
+	bpl	  L(outer_top)
+L(outer_end):
+
+	tst	  n, #4
+	beq	  L(b0xx)
+L(b1xx):mov	  i, nents
+	mov	  r12, tp
+	veor	  q13, q13, q13
+	veor	  q2, q2, q2
+	ALIGN(16)
+L(tp4):	vceq.i32  maskq, q13, q15
+	vld1.32	  {q0}, [tp]		C 4-limb slice
+	vadd.i32 q13, q13, q14
+	vbit	  q2, q0, maskq
+	add	  tp, tp, n, lsl #2
+	subs	  i, i, #1
+	bne	  L(tp4)
+	vst1.32	  {q2}, [rp]!
+	add	  tp, r12, #16
+
+L(b0xx):tst	  n, #2
+	beq	  L(b00x)
+L(b01x):mov	  i, nents
+	mov	  r12, tp
+	veor	  d26, d26, d26
+	veor	  d4, d4, d4
+	ALIGN(16)
+L(tp2):	vceq.i32  maskd, d26, d30
+	vld1.32	  {d0}, [tp]		C 2-limb slice
+	vadd.i32 d26, d26, d28
+	vbit	  d4, d0, maskd
+	add	  tp, tp, n, lsl #2
+	subs	  i, i, #1
+	bne	  L(tp2)
+	vst1.32	  {d4}, [rp]!
+	add	  tp, r12, #8
+
+L(b00x):tst	  n, #1
+	beq	  L(b000)
+L(b001):mov	  i, nents
+	mov	  r12, tp
+	veor	  d26, d26, d26
+	veor	  d4, d4, d4
+	ALIGN(16)
+L(tp1):	vceq.i32  maskd, d26, d30
+	vld1.32	  {d0[0]}, [tp]		C final single limb
+	vadd.i32 d26, d26, d28
+	vbit	  d4, d0, maskd
+	add	  tp, tp, n, lsl #2
+	subs	  i, i, #1
+	bne	  L(tp1)
+	vst1.32	  {d4[0]}, [rp]
+
+L(b000):pop	{r4-r5}
+	bx	r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/rsh1aors_n.asm b/gmp-6.3.0/mpn/arm/rsh1aors_n.asm
new file mode 100644
index 0000000..f2e3006
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/rsh1aors_n.asm
@@ -0,0 +1,124 @@
+dnl  ARM mpn_rsh1add_n and mpn_rsh1sub_n.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	3.64-3.7
+C Cortex-A15	 2.5
+
+C TODO
+C  * Not optimised.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n',  `r3')
+
+ifdef(`OPERATION_rsh1add_n', `
+  define(`ADDSUB',	adds)
+  define(`ADDSUBC',	adcs)
+  define(`RSTCY',	`cmn	$1, $1')
+  define(`func',	mpn_rsh1add_n)
+  define(`func_nc',	mpn_rsh1add_nc)')
+ifdef(`OPERATION_rsh1sub_n', `
+  define(`ADDSUB',	subs)
+  define(`ADDSUBC',	sbcs)
+  define(`RSTCY',
+	`mvn	$2, #0x80000000
+	cmp	$2, $1')
+  define(`func',	mpn_rsh1sub_n)
+  define(`func_nc',	mpn_rsh1sub_nc)')
+
+MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
+
+ASM_START()
+PROLOGUE(func)
+	push	{r4-r11}
+	ldr	r4, [up], #4
+	ldr	r8, [vp], #4
+	ADDSUB	r4, r4, r8		C first limb of the sum or difference
+	movs	r12, r7, rrx		C bit 31 of r12 = carry; r7 is not yet set, only bit 31 is read later
+	and	r11, r4, #1		C return value
+	subs	n, n, #4
+	blo	L(end)
+
+L(top):	ldmia	up!, {r5,r6,r7}
+	ldmia	vp!, {r8,r9,r10}
+	cmn	r12, r12		C restore the saved carry from bit 31 of r12
+	ADDSUBC	r5, r5, r8
+	ADDSUBC	r6, r6, r9
+	ADDSUBC	r7, r7, r10
+	movs	r12, r7, rrx		C save the new carry in bit 31; C = low bit of r7
+	movs	r6, r6, rrx		C shift right by one, each limb takes the bit from the limb above
+	movs	r5, r5, rrx
+	movs	r4, r4, rrx
+	subs	n, n, #3
+	stmia	rp!, {r4,r5,r6}
+	mov	r4, r7			C top limb stays pending until its high bit is known
+	bhs	L(top)
+
+L(end):	cmn	n, #2			C flags select 2, 1 or 0 leftover limbs
+	bls	L(e2)
+	ldm	up, {r5,r6}		C two limbs left
+	ldm	vp, {r8,r9}
+	cmn	r12, r12
+	ADDSUBC	r5, r5, r8
+	ADDSUBC	r6, r6, r9
+	movs	r12, r6, rrx
+	movs	r5, r5, rrx
+	movs	r4, r4, rrx
+	stmia	rp!, {r4,r5}
+	mov	r4, r6
+	b	L(e1)
+
+L(e2):	bne	L(e1)			C Z from the cmn above: one limb left
+	ldr	r5, [up, #0]
+	ldr	r8, [vp, #0]
+	cmn	r12, r12
+	ADDSUBC	r5, r5, r8
+	movs	r12, r5, rrx
+	movs	r4, r4, rrx
+	str	r4, [rp], #4
+	mov	r4, r5
+
+L(e1):	RSTCY(	r12, r1)		C set C from bit 31 of r12; add and sub variants differ
+	mov	r4, r4, rrx		C C becomes the top bit of the last limb
+	str	r4, [rp, #0]
+	mov	r0, r11			C low bit of the first limb, saved above
+	pop	{r4-r11}
+	return	r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/rshift.asm b/gmp-6.3.0/mpn/arm/rshift.asm
new file mode 100644
index 0000000..9ddbc2e
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/rshift.asm
@@ -0,0 +1,86 @@
+dnl  ARM mpn_rshift.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 1997, 2000, 2001, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 3.5
+C Cortex-A15	 ?
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+define(`cnt', `r3')
+define(`tnc', `r12')
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+	push	{r4, r6, r7, r8}
+	ldr	r4, [up]		C r4 = up[0], kept to form the return value
+	rsb	tnc, cnt, #32		C tnc = 32 - cnt
+
+	mov	r7, r4, lsr cnt		C r7 = up[0] >> cnt, low part of the first result limb
+	tst	n, #1
+	beq	L(evn)			C n even
+
+L(odd):	subs	n, n, #2
+	bcc	L(1)			C n = 1
+	ldr	r8, [up, #4]!		C pre-increment up, then load the next limb
+	b	L(mid)			C enter the two-limb loop at its second half
+
+L(evn):	ldr	r6, [up, #4]!
+	subs	n, n, #2
+	beq	L(end)			C n = 2: no loop passes needed
+
+L(top):	ldr	r8, [up, #4]!
+	orr	r7, r7, r6, lsl tnc	C merge in the low cnt bits of the next limb
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt		C start the following result limb
+L(mid):	ldr	r6, [up, #4]!
+	orr	r7, r7, r8, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r8, lsr cnt
+	subs	n, n, #2		C two limbs handled per pass
+	bgt	L(top)
+
+L(end):	orr	r7, r7, r6, lsl tnc
+	str	r7, [rp], #4
+	mov	r7, r6, lsr cnt
+L(1):	str	r7, [rp]		C top result limb: last source limb >> cnt
+	mov	r0, r4, lsl tnc		C return up[0] << (32 - cnt), the bits shifted out at the low end
+	pop	{r4, r6, r7, r8}
+	return	r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/sec_tabselect.asm b/gmp-6.3.0/mpn/arm/sec_tabselect.asm
new file mode 100644
index 0000000..76a412b
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/sec_tabselect.asm
@@ -0,0 +1,131 @@
+dnl  ARM mpn_sec_tabselect
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 2.33
+C Cortex-A15	 2.2
+
+C TODO
+C  * Consider using special code for small nents, either swapping the inner and
+C    outer loops, or providing a few completely unrolling the inner loops.
+
+define(`rp', `r0')
+define(`tp', `r1')
+define(`n', `r2')
+define(`nents', `r3')
+C which on stack
+
+define(`i', `r11')
+define(`j', `r12')
+define(`c', `r14')
+define(`mask', `r7')
+
+ASM_START()
+PROLOGUE(mpn_sec_tabselect)
+	push	{r4-r11, r14}		C nine registers, so stack arguments now start at sp + 36
+
+	subs	j, n, #3
+	bmi	L(outer_end)		C fewer than 3 limbs per entry: only a tail to do
+L(outer_top):
+	ldr	c, [sp, #36]		C c = which, the first stack argument
+	mov	i, nents
+	push	{tp}			C remember where this 3-limb column starts
+
+	mov	r8, #0			C r8, r9, r10 accumulate the selected limbs
+	mov	r9, #0
+	mov	r10, #0
+
+L(top):	subs	c, c, #1		C borrows (C clear) only on the pass where c was 0
+	ldm	tp, {r4,r5,r6}		C load this entry unconditionally
+	sbc	mask, mask, mask	C mask = -1 if the subs borrowed, else 0
+	subs	i, i, #1
+	add	tp, tp, n, lsl #2	C same limbs of the next entry, n limbs further on
+	and	r4, r4, mask
+	and	r5, r5, mask
+	and	r6, r6, mask
+	orr	r8, r8, r4
+	orr	r9, r9, r5
+	orr	r10, r10, r6
+	bge	L(top)			C NOTE(review): flags are from subs i; bge is still taken when i reaches 0, so with i = nents this body runs nents+1 times - the extra pass has mask 0 when which < nents but still loads from tp; confirm intended
+
+	stmia	rp!, {r8,r9,r10}	C store three selected limbs
+	pop	{tp}
+	add	tp, tp, #12		C move to the next 3-limb column
+	subs	j, j, #3
+	bpl	L(outer_top)
+L(outer_end):
+
+	cmp	j, #-1			C j is -3, -2 or -1 here: 0, 1 or 2 limbs remain
+	bne	L(n2)
+
+	ldr	c, [sp, #36]		C c = which again for the 2-limb tail
+	mov	i, nents
+	mov	r8, #0
+	mov	r9, #0
+L(tp2):	subs	c, c, #1
+	sbc	mask, mask, mask	C same mask construction as in L(top)
+	ldm	tp, {r4,r5}
+	subs	i, i, #1
+	add	tp, tp, n, lsl #2
+	and	r4, r4, mask
+	and	r5, r5, mask
+	orr	r8, r8, r4
+	orr	r9, r9, r5
+	bge	L(tp2)			C same loop shape and pass count as L(top)
+	stmia	rp, {r8,r9}
+	pop	{r4-r11, r14}
+	return	lr
+
+L(n2):	cmp	j, #-2
+	bne	L(n1)			C j = -3: nothing remains
+
+	ldr	c, [sp, #36]		C c = which again for the 1-limb tail
+	mov	i, nents
+	mov	r8, #0
+L(tp1):	subs	c, c, #1
+	sbc	mask, mask, mask
+	ldr	r4, [tp]
+	subs	i, i, #1
+	add	tp, tp, n, lsl #2
+	and	r4, r4, mask
+	orr	r8, r8, r4
+	bge	L(tp1)			C same loop shape and pass count as L(top)
+	str	r8, [rp]
+L(n1):	pop	{r4-r11, r14}
+	return	lr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/udiv.asm b/gmp-6.3.0/mpn/arm/udiv.asm
new file mode 100644
index 0000000..7c04789
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/udiv.asm
@@ -0,0 +1,104 @@
+dnl  ARM mpn_udiv_qrnnd -- divide a two limb dividend and a one limb divisor.
+dnl  Return quotient and store remainder through a supplied pointer.
+
+dnl  Copyright 2001, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`rem_ptr',`r0')
+define(`n1',`r1')
+define(`n0',`r2')
+define(`d',`r3')
+
+C divstep -- develop one quotient bit.  Dividend in $1$2, divisor in $3.
+C Quotient bit is shifted into $2.
+define(`divstep',
+       `adcs $2, $2, $2
+	adc $1, $1, $1
+	cmp $1, $3
+	subcs $1, $1, $3')
+
+ASM_START()
+PROLOGUE(mpn_udiv_qrnnd)
+	mov	r12, #8			C loop counter for both loops below
+	cmp	d, #0x80000000		C check divisor msb and clear carry
+	bcs	L(_large_divisor)
+
+L(oop):	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	sub	r12, r12, #1		C no s suffix: the carry from the last divstep is kept
+	teq	r12, #0
+	bne	L(oop)			C 8 passes of 4 divsteps: 32 quotient bits
+
+	str	n1, [rem_ptr]		C store remainder
+	adc	r0, n0, n0		C quotient: add last carry from divstep
+	return	lr
+
+L(_large_divisor):
+	stmfd	sp!, { r8, lr }		C r8 and lr are used as scratch below
+
+	and	r8, n0, #1		C save lsb of dividend
+	mov	lr, n1, lsl #31
+	orrs	n0, lr, n0, lsr #1	C n0 = lo(n1n0 >> 1)
+	mov	n1, n1, lsr #1		C n1 = hi(n1n0 >> 1)
+
+	and	lr, d, #1		C save lsb of divisor
+	movs	d, d, lsr #1		C d = floor(orig_d / 2)
+	adc	d, d, #0		C d = ceil(orig_d / 2)
+
+L(oop2):
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	divstep(n1,n0,d)
+	sub	r12, r12, #1
+	teq	r12, #0
+	bne	L(oop2)			C again 8 passes of 4 divsteps
+
+	adc	n0, n0, n0		C shift and add last carry from divstep
+	add	n1, r8, n1, lsl #1	C shift in omitted dividend lsb
+	tst	lr, lr			C test saved divisor lsb
+	beq	L(_even_divisor)
+
+	rsb	d, lr, d, lsl #1	C restore orig d value
+	adds	n1, n1, n0		C fix remainder for omitted divisor lsb
+	addcs	n0, n0, #1		C adjust quotient if rem. fix carried
+	subcs	n1, n1, d		C adjust remainder accordingly
+	cmp	n1, d			C remainder >= divisor?
+	subcs	n1, n1, d		C adjust remainder
+	addcs	n0, n0, #1		C adjust quotient
+
+L(_even_divisor):
+	str	n1, [rem_ptr]		C store remainder
+	mov	r0, n0			C quotient
+	ldmfd	sp!, { r8, pc }		C restore r8 and return by loading the saved lr into pc
+EPILOGUE(mpn_udiv_qrnnd)
diff --git a/gmp-6.3.0/mpn/arm/v5/gcd_11.asm b/gmp-6.3.0/mpn/arm/v5/gcd_11.asm
new file mode 100644
index 0000000..3c2b48f
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v5/gcd_11.asm
@@ -0,0 +1,70 @@
+dnl  ARM v5 mpn_gcd_11.
+
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for ARM by Torbjörn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/bit (approx)
+C StrongARM	 -
+C XScale	 ?
+C Cortex-A5	 6.45	obsolete
+C Cortex-A7	 6.41	obsolete
+C Cortex-A8	 5.0	obsolete
+C Cortex-A9	 5.9	obsolete
+C Cortex-A15	 4.40	obsolete
+C Cortex-A17	 5.68	obsolete
+C Cortex-A53	 4.37	obsolete
+C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
+
+define(`u0', `r0')
+define(`v0', `r1')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+	subs	r3, u0, v0		C 0	r3 = u - v, carry set when u >= v
+	beq	L(end)			C	u = v: the result is already in r0
+
+	ALIGN(16)
+L(top):	sub	r2, v0, u0		C 0,5	r2 = v - u, the negation of r3
+	and	r12, r2, r3		C 1	x AND -x keeps only the lowest set bit of the difference
+	clz	r12, r12		C 2
+	rsb	r12, r12, #31		C 3	r12 = index of that bit = number of trailing zeros
+	rsbcc	r3, r3, #0		C v = abs(u-v), even	1
+	movcs	u0, v0			C u = min(u,v)		1
+	lsr	v0, r3, r12		C 4	v = abs(u-v) with its trailing zeros shifted out
+	subs	r3, u0, v0		C 5	next difference, also the loop test
+	bne	L(top)			C
+
+L(end):	bx	lr			C return value is u0 (r0)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v5/gcd_22.asm b/gmp-6.3.0/mpn/arm/v5/gcd_22.asm
new file mode 100644
index 0000000..0643b7c
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v5/gcd_22.asm
@@ -0,0 +1,117 @@
+dnl  ARM v5 mpn_gcd_22.
+ +dnl Copyright 2019, 2022 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C StrongARM - +C XScale - +C ARM11 13 +C Cortex-A5 ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 ? +C Cortex-A12 ? +C Cortex-A15 ? +C Cortex-A17 ? +C Cortex-A53 ? 
+
+
+define(`gp', `r0')
+
+define(`u1', `r1')
+define(`u0', `r2')
+define(`v1', `r3')
+define(`v0', `r4')
+
+define(`t0', `r5')
+define(`t1', `r6')
+define(`cnt', `r7')
+
+ASM_START()
+PROLOGUE(mpn_gcd_22)
+	push	{ r4-r7 }
+
+	ldr	v0, [sp,#16]		C v0 is the first stack argument, just above the four pushed registers
+
+L(top):	subs	t0, u0, v0		C 0 7	low limb of u - v
+	beq	L(lowz)
+	sbcs	t1, u1, v1		C 1 8	high limb of u - v, carry clear when u < v
+
+	sub	cnt, v0, u0
+	and	cnt, cnt, t0		C x AND -x: lowest set bit of the low limb difference
+
+	negcc	t0, t0			C u < v: negate t1:t0 to get abs(u-v)
+	mvncc	t1, t1			C t0 is nonzero here, so the high limb is just complemented
+L(bck):	movcc	v0, u0			C u < v: v takes the old (smaller) u
+	movcc	v1, u1
+
+	clz	r12, cnt		C 2
+	rsb	cnt, r12, #31		C 3	cnt = number of trailing zeros of the difference
+	add	r12, r12, #1		C r12 = 32 - cnt
+
+	lsr	u0, t0, cnt		C 3	two-limb right shift of t1:t0 by cnt into u1:u0
+	lsl	r12, t1, r12		C 4
+	lsr	u1, t1, cnt		C 3
+	orr	u0, u0, r12		C 5
+
+	orrs	r12, u1, v1
+	bne	L(top)			C loop while either high limb is nonzero
+
+
+	str	r12, [gp,#4]		C high result limb <= 0 (r12 is zero here)
+
+	mov	r6, gp			C keep gp in r6 across the call
+	mov	r0, u0			C pass 1st argument
+	mov	r1, v0			C pass 2nd argument
+	mov	r7, r14			C preserve link register
+	bl	mpn_gcd_11		C NOTE(review): relies on mpn_gcd_11 leaving r6 and r7 intact - confirm for every variant that may be linked
+	str	r0, [r6,#0]		C low result limb = value returned by mpn_gcd_11
+	mov	r14, r7
+	pop	{ r4-r7 }
+	bx	r14
+
+L(lowz):C We come here when v0 - u0 = 0
+	C 1. If v1 - u1 = 0, then gcd is u = v.
+	C 2. Else compute gcd_21({v1,v0}, |u1-v1|)
+	subs	t0, u1, v1
+	beq	L(end)
+	mov	t1, #0			C treat the high limb difference as t0 with t1 = 0, an implicit 32-bit shift
+	sub	cnt, v1, u1
+	and	cnt, cnt, t0		C lowest set bit of the high limb difference
+	negcc	t0, t0			C flags still from the subs above
+	b	L(bck)
+
+L(end):	str	v0, [gp,#0]		C u = v: store v as the two-limb result
+	str	v1, [gp,#4]
+	pop	{ r4-r7 }
+	bx	r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v5/mod_1_1.asm b/gmp-6.3.0/mpn/arm/v5/mod_1_1.asm
new file mode 100644
index 0000000..3cf0cd7
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v5/mod_1_1.asm
@@ -0,0 +1,129 @@
+dnl  ARM mpn_mod_1_1p
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 7
+C Cortex-A15	 6
+
+define(`ap', `r0')
+define(`n', `r1')
+define(`d', `r2')
+define(`cps',`r3')
+
+ASM_START()
+PROLOGUE(mpn_mod_1_1p)
+	push	{r4-r10}
+	add	r0, r0, r1, asl #2	C r0 = ap + n limbs, one past the top limb
+	ldr	r5, [r0, #-4]!		C r5 = ap[n-1]
+	ldr	r12, [r0, #-4]!		C r12 = ap[n-2]
+	subs	r1, r1, #2
+	ble	L(4)			C n <= 2: no loop passes
+	ldr	r8, [r3, #12]		C r8 = cps[3]
+	mov	r4, r12
+	mov	r10, r5
+	umull	r7, r5, r10, r8		C r5:r7 = high limb * cps[3]
+	sub	r1, r1, #1
+	b	L(mid)
+
+L(top):	adds	r12, r6, r7
+	adcs	r10, r4, r5
+	sub	r1, r1, #1
+	mov	r6, #0
+	movcs	r6, r8			C r6 = cps[3] if the adcs carried, else 0
+	umull	r7, r5, r10, r8
+	adds	r4, r12, r6
+	subcs	r4, r4, r2		C on carry, subtract d (r2)
+L(mid):	ldr	r6, [r0, #-4]!		C next lower limb of ap
+	teq	r1, #0
+	bne	L(top)
+
+	adds	r12, r6, r7
+	adcs	r5, r4, r5
+	subcs	r5, r5, r2
+L(4):	ldr	r1, [r3, #4]		C r1 = cps[1], used as a shift count below
+	cmp	r1, #0
+	beq	L(7)			C zero shift count: cps[2] is not read on this path
+	ldr	r4, [r3, #8]		C r4 = cps[2]
+	umull	r0, r6, r5, r4
+	adds	r12, r0, r12
+	addcs	r6, r6, #1
+	rsb	r0, r1, #32
+	mov	r0, r12, lsr r0
+	orr	r5, r0, r6, asl r1	C r5:r12 = r6:r12 shifted left by cps[1]
+	mov	r12, r12, asl r1
+	b	L(8)
+L(7):	cmp	r5, r2
+	subcs	r5, r5, r2		C reduce r5 below d when r5 >= d
+L(8):	ldr	r0, [r3, #0]		C r0 = cps[0]
+	umull	r4, r3, r5, r0		C overwrites r3 (cps); it is not needed after this
+	add	r5, r5, #1
+	adds	r0, r4, r12
+	adc	r5, r3, r5
+	mul	r5, r2, r5
+	sub	r12, r12, r5
+	cmp	r12, r0
+	addhi	r12, r12, r2
+	cmp	r2, r12
+	subls	r12, r12, r2
+	mov	r0, r12, lsr r1		C return value: r12 shifted right by cps[1]
+	pop	{r4-r10}
+	bx	r14
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps)
+	stmfd	sp!, {r4, r5, r6, r14}
+	mov	r5, r0			C r5 = first argument, the table that is filled in below
+	clz	r4, r1			C r4 = number of leading zeros of the second argument
+	mov	r0, r1, asl r4		C second argument shifted left by that count
+	rsb	r6, r0, #0		C r6 = negated shifted value
+	bl	mpn_invert_limb
+	str	r0, [r5, #0]		C cps[0] = value returned by mpn_invert_limb
+	str	r4, [r5, #4]		C cps[1] = shift count
+	cmp	r4, #0
+	beq	L(2)			C zero shift count: cps[2] is left unwritten
+	rsb	r1, r4, #32
+	mov	r3, #1
+	mov	r3, r3, asl r4
+	orr	r3, r3, r0, lsr r1
+	mul	r3, r6, r3
+	mov	r4, r3, lsr r4
+	str	r4, [r5, #8]		C cps[2]
+L(2):	mul	r0, r6, r0
+	str	r0, [r5, #12]		C cps[3]
+	ldmfd	sp!, {r4, r5, r6, pc}	C restore and return by loading the saved r14 into pc
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v5/mod_1_2.asm b/gmp-6.3.0/mpn/arm/v5/mod_1_2.asm
new file mode 100644
index 0000000..aa26ecb
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v5/mod_1_2.asm
@@ -0,0 +1,156 @@
+dnl  ARM mpn_mod_1s_2p
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM	 -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 4.25
+C Cortex-A15	 3
+
+define(`ap', `r0')
+define(`n', `r1')
+define(`d', `r2')
+define(`cps',`r3')
+
+ASM_START()
+PROLOGUE(mpn_mod_1s_2p)
+	push	{r4-r10}
+	tst	n, #1			C Z set when n is even; consumed by the beq below
+	add	r7, r3, #8
+	ldmia	r7, {r7, r8, r12}	C load B1, B2, B3 (cps[2], cps[3], cps[4])
+	add	ap, ap, n, lsl #2	C put ap at operand end
+	beq	L(evn)
+
+L(odd):	subs	n, n, #1
+	beq	L(1)			C n = 1
+	ldmdb	ap!, {r4,r6,r9}		C top three limbs
+	mov	r10, #0
+	umlal	r4, r10, r6, r7		C r10:r4 += r6 * B1
+	umlal	r4, r10, r9, r8		C r10:r4 += r9 * B2
+	b	L(com)
+
+L(evn):	ldmdb	ap!, {r4,r10}		C top two limbs
+L(com):	subs	n, n, #2
+	ble	L(end)
+	ldmdb	ap!, {r5,r6}
+	b	L(mid)
+
+L(top):	mov	r9, #0
+	umlal	r5, r9, r6, r7		C B1
+	umlal	r5, r9, r4, r8		C B2
+	ldmdb	ap!, {r4,r6}
+	umlal	r5, r9, r10, r12	C B3
+	ble	L(xit)			C flags still from the subs at L(mid)
+	mov	r10, #0
+	umlal	r4, r10, r6, r7		C B1
+	umlal	r4, r10, r5, r8		C B2
+	ldmdb	ap!, {r5,r6}
+	umlal	r4, r10, r9, r12	C B3
+L(mid):	subs	n, n, #4
+	bge	L(top)
+
+	mov	r9, #0
+	umlal	r5, r9, r6, r7		C B1
+	umlal	r5, r9, r4, r8		C B2
+	umlal	r5, r9, r10, r12	C B3
+	mov	r4, r5
+
+L(end):	movge	r9, r10			C executed iff coming via xit
+	ldr	r6, [r3, #4]		C cps[1] = cnt
+	mov	r5, #0
+	umlal	r4, r5, r9, r7		C r5:r4 = r4 + r9 * B1
+	mov	r7, r5, lsl r6
+L(x):	rsb	r1, r6, #32
+	orr	r8, r7, r4, lsr r1	C r8:r9 = the two-limb value shifted left by cnt
+	mov	r9, r4, lsl r6
+	ldr	r5, [r3, #0]		C r5 = cps[0]
+	add	r0, r8, #1
+	umull	r12, r1, r8, r5
+	adds	r4, r12, r9
+	adc	r1, r1, r0
+	mul	r5, r2, r1
+	sub	r9, r9, r5
+	cmp	r9, r4
+	addhi	r9, r9, r2
+	cmp	r2, r9
+	subls	r9, r9, r2
+	mov	r0, r9, lsr r6		C return value: r9 shifted right by cnt
+	pop	{r4-r10}
+	bx	r14
+
+L(xit):	mov	r10, #0
+	umlal	r4, r10, r6, r7		C B1
+	umlal	r4, r10, r5, r8		C B2
+	umlal	r4, r10, r9, r12	C B3
+	b	L(end)
+
+L(1):	ldr	r6, [r3, #4]		C cps[1] = cnt
+	ldr	r4, [ap, #-4]		C ap[0]
+	mov	r7, #0
+	b	L(x)
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1s_2p_cps)
+	push	{r4-r8, r14}
+	clz	r4, r1			C r4 = cnt, the number of leading zeros of b
+	mov	r5, r1, lsl r4		C b <<= cnt
+	mov	r6, r0			C r6 = cps
+	mov	r0, r5
+	bl	mpn_invert_limb
+	rsb	r3, r4, #32
+	mov	r3, r0, lsr r3
+	mov	r2, #1
+	orr	r3, r3, r2, lsl r4
+	rsb	r1, r5, #0		C r1 = negated shifted b
+	mul	r2, r1, r3
+	umull	r3, r12, r2, r0
+	add	r12, r2, r12
+	mvn	r12, r12
+	mul	r1, r5, r12
+	cmp	r1, r3
+	addhi	r1, r1, r5
+	umull	r12, r7, r1, r0
+	add	r7, r1, r7
+	mvn	r7, r7
+	mul	r3, r5, r7
+	cmp	r3, r12
+	addhi	r3, r3, r5
+	mov	r5, r2, lsr r4
+	mov	r7, r1, lsr r4
+	mov	r8, r3, lsr r4
+	stmia	r6, {r0,r4,r5,r7,r8}	C fill cps: [0] = mpn_invert_limb result, [1] = cnt, [2..4] = the three values shifted right by cnt
+	pop	{r4-r8, pc}		C restore and return by loading the saved r14 into pc
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v6/addmul_1.asm b/gmp-6.3.0/mpn/arm/v6/addmul_1.asm
new file mode 100644
index 0000000..a38af58
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/addmul_1.asm
@@ -0,0 +1,112 @@
+dnl  ARM mpn_addmul_1.
+
+dnl  Copyright 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C ARM11		 6.4
+C Cortex-A7	 5.25
+C Cortex-A8	 7
+C Cortex-A9	 3.25
+C Cortex-A15	 4
+
+C TODO
+C  * Micro-optimise feed-in code.
+C  * Optimise for n=1,2 by delaying register saving.
+C  * Try using ldm/stm.
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`v0',`r3')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+	stmfd	sp!, { r4, r5, r6, r7 }
+
+	ands	r6, n, #3		C n mod 4 picks the entry point into the 4-way unrolled loop
+	mov	r12, #0			C r12 = running carry limb
+	beq	L(fi0)
+	cmp	r6, #2
+	bcc	L(fi1)
+	beq	L(fi2)
+
+L(fi3):	ldr	r4, [up], #4		C n mod 4 = 3
+	ldr	r6, [rp, #0]
+	ldr	r5, [up], #4
+	b	L(lo3)
+
+L(fi0):	ldr	r5, [up], #4		C n mod 4 = 0
+	ldr	r7, [rp], #4
+	ldr	r4, [up], #4
+	b	L(lo0)
+
+L(fi1):	ldr	r4, [up], #4		C n mod 4 = 1
+	ldr	r6, [rp], #8
+	subs	n, n, #1
+	beq	L(1)			C n = 1: a single multiply-accumulate
+	ldr	r5, [up], #4
+	b	L(lo1)
+
+L(fi2):	ldr	r5, [up], #4		C n mod 4 = 2
+	ldr	r7, [rp], #12
+	ldr	r4, [up], #4
+	b	L(lo2)
+
+	ALIGN(16)
+L(top):	ldr	r6, [rp, #-8]
+	ldr	r5, [up], #4
+	str	r7, [rp, #-12]
+L(lo1):	umaal	r6, r12, r4, v0		C r12:r6 = r4 * v0 + r6 + r12
+	ldr	r7, [rp, #-4]
+	ldr	r4, [up], #4
+	str	r6, [rp, #-8]
+L(lo0):	umaal	r7, r12, r5, v0		C r12:r7 = r5 * v0 + r7 + r12
+	ldr	r6, [rp, #0]
+	ldr	r5, [up], #4
+	str	r7, [rp, #-4]
+L(lo3):	umaal	r6, r12, r4, v0
+	ldr	r7, [rp, #4]
+	ldr	r4, [up], #4
+	str	r6, [rp], #16		C store, then advance rp by four limbs
+L(lo2):	umaal	r7, r12, r5, v0
+	subs	n, n, #4
+	bhi	L(top)
+
+	ldr	r6, [rp, #-8]		C wind down: one more limb to accumulate
+	str	r7, [rp, #-12]
+L(1):	umaal	r6, r12, r4, v0
+	str	r6, [rp, #-8]
+	mov	r0, r12			C return the final carry limb
+	ldmfd	sp!, { r4, r5, r6, r7 }
+	bx	lr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v6/addmul_2.asm b/gmp-6.3.0/mpn/arm/v6/addmul_2.asm
new file mode 100644
index 0000000..69d0b8f
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/addmul_2.asm
@@ -0,0 +1,125 @@
+dnl  ARM mpn_addmul_2.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+
+include(`../config.m4')
+
+C	     cycles/limb
+C StrongARM:	 -
+C XScale	 -
+C ARM11		 4.68
+C Cortex-A5	 3.63
+C Cortex-A7	 3.65
+C Cortex-A8	 4.0
+C Cortex-A9	 2.25
+C Cortex-A15	 2.5
+C Cortex-A17	 2.13
+C Cortex-A53	 3.5
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`vp',`r3')
+
+define(`v0',`r6')
+define(`v1',`r7')
+define(`u0',`r3')
+define(`u1',`r9')
+
+define(`cya',`r8')
+define(`cyb',`r12')
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_2)
+	push	{ r4-r9 }
+
+	ldrd	v0, v1, [vp, #0]	C v0, v1 = vp[0], vp[1]; r3 is reused as u0 from here on
+	mov	cya, #0			C two carry limbs, one per multiplier limb
+	mov	cyb, #0
+
+	tst	n, #1
+	beq	L(evn)
+
+L(odd):	ldr	u1, [up, #0]
+	ldr	r4, [rp, #0]
+	tst	n, #2
+	beq	L(fi1)
+L(fi3):	sub	up, up, #8		C n mod 4 = 3: bias the pointers for entry at L(lo3)
+	sub	rp, rp, #8
+	b	L(lo3)
+L(fi1):	sub	n, n, #1		C n mod 4 = 1: enter at the loop top
+	b	L(top)
+
+L(evn):	ldr	u0, [up, #0]
+	ldr	r5, [rp, #0]
+	tst	n, #2
+	bne	L(fi2)
+L(fi0):	sub	up, up, #4		C n mod 4 = 0: bias the pointers for entry at L(lo0)
+	sub	rp, rp, #4
+	b	L(lo0)
+L(fi2):	sub	up, up, #12		C n mod 4 = 2: bias the pointers for entry at L(lo2)
+	sub	rp, rp, #12
+	b	L(lo2)
+
+	ALIGN(16)
+L(top):	ldr	r5, [rp, #4]
+	umaal	r4, cya, u1, v0		C cya:r4 = u1 * v0 + r4 + cya
+	ldr	u0, [up, #4]
+	umaal	r5, cyb, u1, v1		C cyb:r5 = u1 * v1 + r5 + cyb
+	str	r4, [rp, #0]
+L(lo0):	ldr	r4, [rp, #8]
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #8]
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #4]
+L(lo3):	ldr	r5, [rp, #12]
+	umaal	r4, cya, u1, v0
+	ldr	u0, [up, #12]
+	umaal	r5, cyb, u1, v1
+	str	r4, [rp, #8]
+L(lo2):	ldr	r4, [rp, #16]!		C pre-increment rp by four limbs
+	umaal	r5, cya, u0, v0
+	ldr	u1, [up, #16]!		C pre-increment up by four limbs
+	umaal	r4, cyb, u0, v1
+	str	r5, [rp, #-4]
+	subs	n, n, #4
+	bhi	L(top)
+
+L(end):	umaal	r4, cya, u1, v0
+	umaal	cya, cyb, u1, v1	C cyb:cya = u1 * v1 + cya + cyb
+	str	r4, [rp, #0]
+	str	cya, [rp, #4]		C one limb beyond the n accumulated limbs is stored
+	mov	r0, cyb			C return the top carry limb
+
+	pop	{ r4-r9 }
+	bx	r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v6/addmul_3.asm b/gmp-6.3.0/mpn/arm/v6/addmul_3.asm
new file mode 100644
index 0000000..d1490cd
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/addmul_3.asm
@@ -0,0 +1,191 @@
+dnl  ARM mpn_addmul_3.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C ARM11 4.33 +C Cortex-A5 3.28 +C Cortex-A7 3.25 +C Cortex-A8 3.17 +C Cortex-A9 2.125 +C Cortex-A15 2 +C Cortex-A17 2.11 +C Cortex-A53 4.18 + +C TODO +C * Use a fast path for n <= KARATSUBA_MUL_THRESHOLD using a jump table, +C avoiding the current multiply. +C * Start the first multiply or multiplies early. 
+
+define(`rp',`r0')
+define(`up',`r1')
+define(`n', `r2')
+define(`vp',`r3')
+
+define(`v0',`r4')  define(`v1',`r5')  define(`v2',`r6')
+define(`u0',`r3')  define(`u1',`r14')
+define(`w0',`r7')  define(`w1',`r8')  define(`w2',`r9')
+define(`cy0',`r10')  define(`cy1',`r11')  define(`cy2',`r12')
+
+
+ASM_START()
+PROLOGUE(mpn_addmul_3)
+	push	{ r4-r11, r14 }
+
+	ldr	w0, =0xaaaaaaab		C 3^{-1} mod 2^32
+	ldm	vp, { v0,v1,v2 }	C load the three multiplier limbs; r3 is reused as u0 afterwards
+	mov	cy0, #0			C three carry limbs, one per multiplier limb
+	mov	cy1, #0
+	mov	cy2, #0
+
+C Tricky n mod 6
+	mul	w0, w0, n		C n * 3^{-1} mod 2^32
+	and	w0, w0, #0xc0000001	C pseudo-CRT mod 3,2
+	sub	n, n, #3
+ifdef(`PIC',`
+	add	pc, pc, w0, ror $28
+	nop
+	b	L(b0)
+	b	L(b2)
+	b	L(b4)
+	.word	0xe7f000f0	C udf
+	b	L(b3)
+	b	L(b5)
+	b	L(b1)
+',`
+	ldr	pc, [pc, w0, ror $28]
+	nop
+	.word	L(b0), L(b2), L(b4), 0, L(b3), L(b5), L(b1)
+')
+
+L(b5):	add	up, up, #-8		C each L(bK) biases rp and up, preloads limbs, and joins the loop at L(loK)
+	ldr	w1, [rp, #0]
+	ldr	w2, [rp, #4]
+	ldr	u1, [up, #8]
+	b	L(lo5)
+
+L(b4):	add	rp, rp, #-4
+	add	up, up, #-12
+	ldr	w2, [rp, #4]
+	ldr	w0, [rp, #8]
+	ldr	u0, [up, #12]
+	b	L(lo4)
+
+L(b3):	add	rp, rp, #-8
+	add	up, up, #-16
+	ldr	w0, [rp, #8]
+	ldr	w1, [rp, #12]
+	ldr	u1, [up, #16]
+	b	L(lo3)
+
+L(b1):	add	rp, rp, #8
+	ldr	w2, [rp, #-8]
+	ldr	w0, [rp, #-4]
+	ldr	u1, [up, #0]
+	b	L(lo1)
+
+L(b0):	add	rp, rp, #4
+	add	up, up, #-4
+	ldr	w0, [rp, #-4]
+	ldr	w1, [rp, #0]
+	ldr	u0, [up, #4]
+	b	L(lo0)
+
+L(b2):	add	rp, rp, #12		C this case falls through into L(top)
+	add	up, up, #4
+	ldr	w1, [rp, #-12]
+	ldr	w2, [rp, #-8]
+	ldr	u0, [up, #-4]
+
+	ALIGN(16)
+L(top):	ldr	w0, [rp, #-4]
+	umaal	w1, cy0, u0, v0		C cy0:w1 = u0 * v0 + w1 + cy0
+	ldr	u1, [up, #0]
+	umaal	w2, cy1, u0, v1		C cy1:w2 = u0 * v1 + w2 + cy1
+	str	w1, [rp, #-12]
+	umaal	w0, cy2, u0, v2		C cy2:w0 = u0 * v2 + w0 + cy2
+L(lo1):	ldr	w1, [rp, #0]
+	umaal	w2, cy0, u1, v0
+	ldr	u0, [up, #4]
+	umaal	w0, cy1, u1, v1
+	str	w2, [rp, #-8]
+	umaal	w1, cy2, u1, v2
+L(lo0):	ldr	w2, [rp, #4]
+	umaal	w0, cy0, u0, v0
+	ldr	u1, [up, #8]
+	umaal	w1, cy1, u0, v1
+	str	w0, [rp, #-4]
+	umaal	w2, cy2, u0, v2
+L(lo5):	ldr	w0, [rp, #8]
+	umaal	w1, cy0, u1, v0
+	ldr	u0, [up, #12]
+	umaal	w2, cy1, u1, v1
+	str	w1, [rp, #0]
+	umaal	w0, cy2, u1, v2
+L(lo4):	ldr	w1, [rp, #12]
+	umaal	w2, cy0, u0, v0
+	ldr	u1, [up, #16]
+	umaal	w0, cy1, u0, v1
+	str	w2, [rp, #4]
+	umaal	w1, cy2, u0, v2
+L(lo3):	ldr	w2, [rp, #16]
+	umaal	w0, cy0, u1, v0
+	ldr	u0, [up, #20]
+	umaal	w1, cy1, u1, v1
+	str	w0, [rp, #8]
+	umaal	w2, cy2, u1, v2
+L(lo2):	subs	n, n, #6		C six limbs per pass
+	add	up, up, #24
+	add	rp, rp, #24
+	bge	L(top)			C flags still from the subs (the adds have no s suffix)
+
+L(end):	umaal	w1, cy0, u0, v0
+	ldr	u1, [up, #0]
+	umaal	w2, cy1, u0, v1
+	str	w1, [rp, #-12]
+	mov	w0, #0			C no further rp limb to add in: start from zero
+	umaal	w0, cy2, u0, v2
+	umaal	w2, cy0, u1, v0
+	umaal	w0, cy1, u1, v1
+	str	w2, [rp, #-8]
+	umaal	cy1, cy2, u1, v2	C cy2:cy1 = u1 * v2 + cy1 + cy2
+	adds	w0, w0, cy0		C fold the three carry limbs into the top result limbs
+	str	w0, [rp, #-4]
+	adcs	w1, cy1, #0
+	str	w1, [rp, #0]
+	adc	r0, cy2, #0		C return value: the final carry limb
+
+	pop	{ r4-r11, pc }		C restore and return by loading the saved r14 into pc
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v6/dive_1.asm b/gmp-6.3.0/mpn/arm/v6/dive_1.asm
new file mode 100644
index 0000000..92de814
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/dive_1.asm
@@ -0,0 +1,149 @@
+dnl  ARM v6 mpn_divexact_1
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		cycles/limb	cycles/limb
+C		norm	unorm	modexact_1c_odd
+C StrongARM	 -	 -
+C XScale	 -	 -
+C Cortex-A7	 ?	 ?
+C Cortex-A8	 ?	 ?
+C Cortex-A9	 9	10	 9
+C Cortex-A15	 7	 7	 7
+
+C Architecture requirements:
+C v5	-
+C v5t	clz
+C v5te	-
+C v6	umaal
+C v6t2	-
+C v7a	-
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+define(`d', `r3')
+
+define(`cy', `r7')
+define(`cnt', `r6')
+define(`tnc', `r10')
+
+ASM_START()
+PROLOGUE(mpn_divexact_1)
+	push	{r4,r5,r6,r7,r8,r9}
+
+	tst	d, #1			C Z set when d is even; consumed by beq L(unnorm) below (no s-suffixed op in between; LEA assumed not to touch flags)
+
+	rsb	r4, d, #0
+	and	r4, r4, d		C d AND -d keeps only the lowest set bit of d
+	clz	r4, r4
+	rsb	cnt, r4, #31		C count_trailing_zeros
+	mov	d, d, lsr cnt		C shift out the trailing zeros of d
+
+C binvert limb
+	LEA(	r4, binvert_limb_table)
+	and	r12, d, #254
+	ldrb	r4, [r4, r12, lsr #1]	C table lookup indexed by bits 1-7 of d
+	mul	r12, r4, r4
+	mul	r12, d, r12
+	rsb	r12, r12, r4, lsl #1	C r12 = 2*r4 - d*r4*r4
+	mul	r4, r12, r12
+	mul	r4, d, r4
+	rsb	r4, r4, r12, lsl #1	C r4 = inverse (same step again: 2*r12 - d*r12*r12)
+
+	ldr	r5, [up], #4		C up[0]
+	mov	cy, #0
+	rsb	r8, r4, #0		C r8 = -inverse
+	beq	L(unnorm)		C d was even: source limbs must be shifted right by cnt
+
+L(norm):
+	subs	n, n, #1
+	mul	r5, r5, r4		C first quotient limb = up[0] * inverse
+	beq	L(end)
+
+	ALIGN(16)
+L(top):	ldr	r9, [up], #4
+	mov	r12, #0
+	str	r5, [rp], #4		C store the previous quotient limb
+	umaal	r12, cy, r5, d		C cy:r12 = r5 * d + cy; cy takes the high limb
+	mul	r5, r9, r4
+	mla	r5, cy, r8, r5		C r5 = (up[i] - cy) * inverse
+	subs	n, n, #1
+	bne	L(top)
+
+L(end):	str	r5, [rp]
+	pop	{r4,r5,r6,r7,r8,r9}
+	bx	r14
+
+L(unnorm):
+	push	{r10,r11}
+	rsb	tnc, cnt, #32		C tnc = 32 - cnt
+	mov	r11, r5, lsr cnt	C r11 = low part of the shifted source limb
+	subs	n, n, #1
+	beq	L(edx)			C n = 1
+
+	ldr	r12, [up], #4
+	orr	r9, r11, r12, lsl tnc	C r9 = next limb of the source shifted right by cnt
+	mov	r11, r12, lsr cnt
+	mul	r5, r9, r4
+	subs	n, n, #1
+	beq	L(edu)
+
+	ALIGN(16)
+L(tpu):	ldr	r12, [up], #4
+	orr	r9, r11, r12, lsl tnc
+	mov	r11, r12, lsr cnt
+	mov	r12, #0
+	str	r5, [rp], #4
+	umaal	r12, cy, r5, d
+	mul	r5, r9, r4
+	mla	r5, cy, r8, r5		C r5 = (shifted limb - cy) * inverse
+	subs	n, n, #1
+	bne	L(tpu)
+
+L(edu):	str	r5, [rp], #4
+	mov	r12, #0
+	umaal	r12, cy, r5, d
+	mul	r5, r11, r4		C top limb: only the bits left in r11
+	mla	r5, cy, r8, r5
+	str	r5, [rp]
+	pop	{r10,r11}
+	pop	{r4,r5,r6,r7,r8,r9}
+	bx	r14
+
+L(edx):	mul	r5, r11, r4		C single limb: (up[0] >> cnt) * inverse
+	str	r5, [rp]
+	pop	{r10,r11}
+	pop	{r4,r5,r6,r7,r8,r9}
+	bx	r14
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v6/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v6/gmp-mparam.h
new file mode 100644
index 0000000..35a7c55
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v6/gmp-mparam.h
@@ -0,0 +1,187 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 700 MHz ARM11 (raspberry pi) */ +/* FFT tuning limit = 8,088,775 */ +/* Generated by tuneup.c, 2019-10-23, gcc 8.3 */ + +#define DIVREM_1_NORM_THRESHOLD 0 /* preinv always */ +#define DIVREM_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 6 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 19 +#define USE_PREINV_DIVREM_1 1 /* preinv always */ +#define DIV_QR_1N_PI1_METHOD 1 /* 71.61% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 38 + +#define DIV_1_VS_MUL_1_PERCENT 251 + +#define MUL_TOOM22_THRESHOLD 38 +#define MUL_TOOM33_THRESHOLD 134 +#define MUL_TOOM44_THRESHOLD 512 +#define MUL_TOOM6H_THRESHOLD 0 /* always */ +#define MUL_TOOM8H_THRESHOLD 620 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 209 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 625 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 209 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 211 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 300 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 55 +#define SQR_TOOM3_THRESHOLD 200 +#define SQR_TOOM4_THRESHOLD 470 +#define SQR_TOOM6_THRESHOLD 614 +#define SQR_TOOM8_THRESHOLD 882 + +#define MULMID_TOOM42_THRESHOLD 62 + +#define MULMOD_BNM1_THRESHOLD 23 +#define SQRMOD_BNM1_THRESHOLD 26 + +#define MUL_FFT_MODF_THRESHOLD 565 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 565, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 15, 5}, { 31, 6}, { 28, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 21, 
6}, { 43, 7}, { 23, 6}, \ + { 47, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \ + { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 71, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 99, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 159,11}, \ + { 95,10}, { 207,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271,11}, { 159,10}, \ + { 351,11}, { 191,10}, { 399,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 511,11}, { 287,10}, \ + { 607,11}, { 319,10}, { 639,11}, { 351,12}, \ + { 191,11}, { 415,13}, { 127,12}, { 255,11}, \ + { 575,12}, { 319,11}, { 671,12}, { 383,11}, \ + { 799,12}, { 447,13}, { 255,12}, { 511,11}, \ + { 1023,12}, { 703,13}, { 383,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1151,13}, { 639,12}, \ + { 1343,13}, { 767,12}, { 1599,13}, { 895,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 98 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 530 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 530, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 28, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 21, 6}, \ + { 43, 7}, { 23, 6}, { 47, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 71, 9}, { 39, 8}, \ + { 83, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 167,10}, { 95, 9}, { 191,10}, { 111,11}, \ + { 63,10}, { 143, 9}, { 287,10}, { 159,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 351,11}, \ + { 191,10}, { 415,11}, { 223,12}, { 127,11}, \ + { 255,10}, { 543,11}, { 287,10}, { 607,11}, \ + { 319,10}, { 639,11}, { 351,12}, { 191,11}, \ + { 
383,10}, { 767,11}, { 415,13}, { 127,12}, \ + { 255,11}, { 607,12}, { 319,11}, { 703,12}, \ + { 383,11}, { 799,12}, { 447,11}, { 895,13}, \ + { 255,12}, { 511,11}, { 1023,12}, { 703,13}, \ + { 383,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1151,13}, { 639,12}, { 1343,13}, { 767,12}, \ + { 1599,13}, { 895,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 104 +#define SQR_FFT_THRESHOLD 4416 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 51 +#define MULLO_MUL_N_THRESHOLD 11278 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 55 +#define SQRLO_SQR_THRESHOLD 8648 + +#define DC_DIV_QR_THRESHOLD 36 +#define DC_DIVAPPR_Q_THRESHOLD 146 +#define DC_BDIV_QR_THRESHOLD 46 +#define DC_BDIV_Q_THRESHOLD 160 + +#define INV_MULMOD_BNM1_THRESHOLD 74 +#define INV_NEWTON_THRESHOLD 145 +#define INV_APPR_THRESHOLD 147 + +#define BINV_NEWTON_THRESHOLD 372 +#define REDC_1_TO_REDC_2_THRESHOLD 6 +#define REDC_2_TO_REDC_N_THRESHOLD 140 + +#define MU_DIV_QR_THRESHOLD 2801 +#define MU_DIVAPPR_Q_THRESHOLD 2801 +#define MUPI_DIV_QR_THRESHOLD 79 +#define MU_BDIV_QR_THRESHOLD 2541 +#define MU_BDIV_Q_THRESHOLD 2764 + +#define POWM_SEC_TABLE 3,20,139,734 + +#define GET_STR_DC_THRESHOLD 27 +#define GET_STR_PRECOMPUTE_THRESHOLD 45 +#define SET_STR_DC_THRESHOLD 342 +#define SET_STR_PRECOMPUTE_THRESHOLD 1290 + +#define FAC_DSC_THRESHOLD 390 +#define FAC_ODD_THRESHOLD 438 + +#define MATRIX22_STRASSEN_THRESHOLD 25 +#define HGCD2_DIV1_METHOD 5 /* 1.32% faster than 3 */ +#define HGCD_THRESHOLD 82 +#define HGCD_APPR_THRESHOLD 81 +#define HGCD_REDUCE_THRESHOLD 4633 +#define GCD_DC_THRESHOLD 345 +#define GCDEXT_DC_THRESHOLD 268 +#define JACOBI_BASE_METHOD 1 /* 3.30% faster than 2 */ + +/* Tuneup completed successfully, took 45018 seconds */ diff --git a/gmp-6.3.0/mpn/arm/v6/mode1o.asm b/gmp-6.3.0/mpn/arm/v6/mode1o.asm new file mode 100644 index 0000000..a2f77a6 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/mode1o.asm @@ -0,0 +1,95 @@ +dnl ARM v6 
mpn_modexact_1c_odd + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? 
+C Cortex-A9 9 +C Cortex-A15 7 + +C Architecture requirements: +C v5 - +C v5t - +C v5te smulbb +C v6 umaal +C v6t2 - +C v7a - + +define(`up', `r0') +define(`n', `r1') +define(`d', `r2') +define(`cy', `r3') + + .protected binvert_limb_table +ASM_START() +PROLOGUE(mpn_modexact_1c_odd) + stmfd sp!, {r4, r5, r6, r7} + + LEA( r4, binvert_limb_table) + + ldr r6, [up], #4 C up[0] + + and r12, d, #254 + ldrb r4, [r4, r12, lsr #1] + smulbb r12, r4, r4 + mul r12, d, r12 + rsb r12, r12, r4, asl #1 + mul r4, r12, r12 + mul r4, d, r4 + rsb r4, r4, r12, asl #1 C r4 = inverse + + subs n, n, #1 + sub r6, r6, cy + mul r6, r6, r4 + beq L(end) + + rsb r5, r4, #0 C r5 = -inverse + +L(top): ldr r7, [up], #4 + mov r12, #0 + umaal r12, cy, r6, d + mul r6, r7, r4 + mla r6, cy, r5, r6 + subs n, n, #1 + bne L(top) + +L(end): mov r12, #0 + umaal r12, cy, r6, d + mov r0, cy + + ldmfd sp!, {r4, r5, r6, r7} + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/mul_1.asm b/gmp-6.3.0/mpn/arm/v6/mul_1.asm new file mode 100644 index 0000000..3c6ef99 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/mul_1.asm @@ -0,0 +1,115 @@ +dnl ARM mpn_mul_1. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C ARM11 6.4 +C Cortex-A7 5.25 +C Cortex-A8 7 +C Cortex-A9 3.25 +C Cortex-A15 4 + +C TODO +C * Micro-optimise feed-in code. +C * Optimise for n=1,2 by delaying register saving. +C * Try using ldm/stm. + +define(`rp',`r0') +define(`up',`r1') +define(`n', `r2') +define(`v0',`r3') + +ASM_START() +PROLOGUE(mpn_mul_1) + stmfd sp!, { r4, r5, r6, r7 } + + ands r6, n, #3 + mov r12, #0 + beq L(fi0) + cmp r6, #2 + bcc L(fi1) + beq L(fi2) + +L(fi3): ldr r4, [up], #4 + mov r6, #0 + ldr r5, [up], #4 + b L(lo3) + +L(fi0): ldr r5, [up], #4 + add rp, rp, #4 + mov r7, #0 + ldr r4, [up], #4 + b L(lo0) + +L(fi1): ldr r4, [up], #4 + mov r6, #0 + add rp, rp, #8 + subs n, n, #1 + beq L(1) + ldr r5, [up], #4 + b L(lo1) + +L(fi2): ldr r5, [up], #4 + add rp, rp, #12 + mov r7, #0 + ldr r4, [up], #4 + b L(lo2) + + ALIGN(16) +L(top): mov r6, #0 + ldr r5, [up], #4 + str r7, [rp, #-12] +L(lo1): umaal r6, r12, r4, v0 + mov r7, #0 + ldr r4, [up], #4 + str r6, [rp, #-8] +L(lo0): umaal r7, r12, r5, v0 + mov r6, #0 + ldr r5, [up], #4 + str r7, [rp, #-4] +L(lo3): umaal r6, r12, r4, v0 + mov r7, #0 + ldr r4, [up], #4 + str r6, [rp], #16 +L(lo2): umaal r7, r12, r5, v0 + subs n, n, #4 + bhi L(top) + + mov r6, #0 + str r7, [rp, #-12] +L(1): umaal r6, r12, r4, v0 + str r6, [rp, #-8] + mov r0, r12 + ldmfd sp!, { r4, r5, r6, r7 } + bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/mul_2.asm b/gmp-6.3.0/mpn/arm/v6/mul_2.asm new file mode 100644 index 0000000..edd27f3 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/mul_2.asm @@ -0,0 +1,135 @@ +dnl ARM mpn_mul_2. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. 
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C ARM11 5.25 +C Cortex-A5 3.63 +C Cortex-A7 3.15 +C Cortex-A8 5.0 +C Cortex-A9 2.25 +C Cortex-A15 2.5 +C Cortex-A17 2.13 +C Cortex-A53 3.5 + +C TODO +C * This is a trivial edit of the addmul_2 code. Check for simplifications, +C and possible speedups to 2.0 c/l. 
+ +define(`rp',`r0') +define(`up',`r1') +define(`n', `r2') +define(`vp',`r3') + +define(`v0',`r6') +define(`v1',`r7') +define(`u0',`r3') +define(`u1',`r9') + +define(`cya',`r8') +define(`cyb',`r12') + + +ASM_START() +PROLOGUE(mpn_mul_2) + push { r4, r5, r6, r7, r8, r9 } + + ldm vp, { v0, v1 } + mov cya, #0 + mov cyb, #0 + + tst n, #1 + beq L(evn) +L(odd): mov r5, #0 + ldr u0, [up, #0] + mov r4, #0 + tst n, #2 + beq L(fi1) +L(fi3): sub up, up, #12 + sub rp, rp, #16 + b L(lo3) +L(fi1): sub n, n, #1 + sub up, up, #4 + sub rp, rp, #8 + b L(lo1) +L(evn): mov r4, #0 + ldr u1, [up, #0] + mov r5, #0 + tst n, #2 + bne L(fi2) +L(fi0): sub up, up, #8 + sub rp, rp, #12 + b L(lo0) +L(fi2): subs n, n, #2 + sub rp, rp, #4 + bls L(end) + + ALIGN(16) +L(top): ldr u0, [up, #4] + umaal r4, cya, u1, v0 + str r4, [rp, #4] + mov r4, #0 + umaal r5, cyb, u1, v1 +L(lo1): ldr u1, [up, #8] + umaal r5, cya, u0, v0 + str r5, [rp, #8] + mov r5, #0 + umaal r4, cyb, u0, v1 +L(lo0): ldr u0, [up, #12] + umaal r4, cya, u1, v0 + str r4, [rp, #12] + mov r4, #0 + umaal r5, cyb, u1, v1 +L(lo3): ldr u1, [up, #16]! + umaal r5, cya, u0, v0 + str r5, [rp, #16]! + mov r5, #0 + umaal r4, cyb, u0, v1 + subs n, n, #4 + bhi L(top) + +L(end): umaal r4, cya, u1, v0 + ldr u0, [up, #4] + umaal r5, cyb, u1, v1 + str r4, [rp, #4] + umaal r5, cya, u0, v0 + umaal cya, cyb, u0, v1 + str r5, [rp, #8] + str cya, [rp, #12] + mov r0, cyb + + pop { r4, r5, r6, r7, r8, r9 } + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/popham.asm b/gmp-6.3.0/mpn/arm/v6/popham.asm new file mode 100644 index 0000000..c254368 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/popham.asm @@ -0,0 +1,139 @@ +dnl ARM mpn_popcount and mpn_hamdist. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. 
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C popcount hamdist +C cycles/limb cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? 
+C Cortex-A9 8.94 9.47 +C Cortex-A15 5.67 6.44 + +C Architecture requirements: +C v5 - +C v5t - +C v5te ldrd strd +C v6 usada8 +C v6t2 - +C v7a - + +ifdef(`OPERATION_popcount',` + define(`func',`mpn_popcount') + define(`ap', `r0') + define(`n', `r1') + define(`a0', `r2') + define(`a1', `r3') + define(`s', `r5') + define(`b_01010101', `r6') + define(`b_00110011', `r7') + define(`b_00001111', `r8') + define(`zero', `r9') + define(`POPC', `$1') + define(`HAMD', `dnl') +') +ifdef(`OPERATION_hamdist',` + define(`func',`mpn_hamdist') + define(`ap', `r0') + define(`bp', `r1') + define(`n', `r2') + define(`a0', `r6') + define(`a1', `r7') + define(`b0', `r4') + define(`b1', `r5') + define(`s', `r11') + define(`b_01010101', `r8') + define(`b_00110011', `r9') + define(`b_00001111', `r10') + define(`zero', `r3') + define(`POPC', `dnl') + define(`HAMD', `$1') +') + +MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) + +ASM_START() +PROLOGUE(func) +POPC(` push { r4-r9 } ') +HAMD(` push { r4-r11 } ') + + ldr b_01010101, =0x55555555 + mov r12, #0 + ldr b_00110011, =0x33333333 + mov zero, #0 + ldr b_00001111, =0x0f0f0f0f + + tst n, #1 + beq L(evn) + +L(odd): ldr a1, [ap], #4 C 1 x 32 1-bit accumulators, 0-1 +HAMD(` ldr b1, [bp], #4 ') C 1 x 32 1-bit accumulators, 0-1 +HAMD(` eor a1, a1, b1 ') + and r4, b_01010101, a1, lsr #1 + sub a1, a1, r4 + and r4, a1, b_00110011 + bic r5, a1, b_00110011 + add r5, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4 + subs n, n, #1 + b L(mid) + +L(evn): mov s, #0 + +L(top): ldrd a0, a1, [ap], #8 C 2 x 32 1-bit accumulators, 0-1 +HAMD(` ldrd b0, b1, [bp], #8') +HAMD(` eor a0, a0, b0 ') +HAMD(` eor a1, a1, b1 ') + subs n, n, #2 + usada8 r12, s, zero, r12 + and r4, b_01010101, a0, lsr #1 + sub a0, a0, r4 + and r4, b_01010101, a1, lsr #1 + sub a1, a1, r4 + and r4, a0, b_00110011 + bic r5, a0, b_00110011 + add a0, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4 + and r4, a1, b_00110011 + bic r5, a1, b_00110011 + add a1, r4, r5, lsr #2 C 8 4-bit accumulators, 0-4 + add r5, 
a0, a1 C 8 4-bit accumulators, 0-8 +L(mid): and r4, r5, b_00001111 + bic r5, r5, b_00001111 + add s, r4, r5, lsr #4 C 4 8-bit accumulators + bne L(top) + + usada8 r0, s, zero, r12 +POPC(` pop { r4-r9 } ') +HAMD(` pop { r4-r11 } ') + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm b/gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm new file mode 100644 index 0000000..0fc4f13 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/sqr_basecase.asm @@ -0,0 +1,544 @@ +dnl ARM v6 mpn_sqr_basecase. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2015 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +C Code structure: +C +C +C m_2(0m4) m_2(2m4) m_2(1m4) m_2(3m4) +C | | | | +C | | | | +C | | | | +C \|/ \|/ \|/ \|/ +C ____________ ____________ +C / \ / \ +C \|/ \ \|/ \ +C am_2(3m4) am_2(1m4) am_2(0m4) am_2(2m4) +C \ /|\ \ /|\ +C \____________/ \____________/ +C \ / +C \ / +C \ / +C cor3 cor2 +C \ / +C \ / +C sqr_diag_addlsh1 + +C TODO +C * Align more labels. +C * Further tweak counter and updates in outer loops. (This could save +C perhaps 5n cycles). +C * Avoid sub-with-lsl in outer loops. We could keep n up-shifted, then +C initialise loop counter i with a right shift. +C * Try to use fewer register. Perhaps coalesce r9 branch target and n_saved. +C (This could save 2-3 cycles for n > 4.) +C * Optimise sqr_diag_addlsh1 loop. The current code uses old-style carry +C propagation. +C * Stop loops earlier suppressing writes of upper-most rp[] values. +C * The addmul_2 loops here runs well on all cores, but mul_2 runs poorly +C particularly on Cortex-A8. + + +define(`rp', r0) +define(`up', r1) +define(`n', r2) + +define(`v0', r3) +define(`v1', r6) +define(`i', r8) +define(`n_saved', r14) +define(`cya', r11) +define(`cyb', r12) +define(`u0', r7) +define(`u1', r9) + +ASM_START() +PROLOGUE(mpn_sqr_basecase) + and r12, n, #3 + cmp n, #4 + addgt r12, r12, #4 + add pc, pc, r12, lsl #2 + nop + b L(4) + b L(1) + b L(2) + b L(3) + b L(0m4) + b L(1m4) + b L(2m4) + b L(3m4) + + +L(1m4): push {r4-r11, r14} + mov n_saved, n + sub i, n, #4 + sub n, n, #2 + add r10, pc, #L(am2_2m4)-.-8 + ldm up, {v0,v1,u0} + sub up, up, #4 + mov cyb, #0 + mov r5, #0 + umull r4, cya, v1, v0 + str r4, [rp], #-12 + mov r4, #0 + b L(ko0) + +L(3m4): push {r4-r11, r14} + mov n_saved, n + sub i, n, #4 + sub n, n, #2 + add r10, pc, #L(am2_0m4)-.-8 + ldm up, {v0,v1,u0} + add up, up, #4 + mov cyb, #0 + mov r5, #0 + umull r4, cya, v1, v0 + str r4, [rp], #-4 + mov r4, #0 + b L(ko2) + +L(2m4): push {r4-r11, r14} + mov n_saved, n + sub i, n, #4 + sub n, n, #2 + add r10, pc, 
#L(am2_3m4)-.-8 + ldm up, {v0,v1,u1} + mov cyb, #0 + mov r4, #0 + umull r5, cya, v1, v0 + str r5, [rp], #-8 + mov r5, #0 + b L(ko1) + +L(0m4): push {r4-r11, r14} + mov n_saved, n + sub i, n, #4 + sub n, n, #2 + add r10, pc, #L(am2_1m4)-.-8 + ldm up, {v0,v1,u1} + mov cyb, #0 + mov r4, #0 + add up, up, #8 + umull r5, cya, v1, v0 + str r5, [rp, #0] + mov r5, #0 + +L(top): ldr u0, [up, #4] + umaal r4, cya, u1, v0 + str r4, [rp, #4] + mov r4, #0 + umaal r5, cyb, u1, v1 +L(ko2): ldr u1, [up, #8] + umaal r5, cya, u0, v0 + str r5, [rp, #8] + mov r5, #0 + umaal r4, cyb, u0, v1 +L(ko1): ldr u0, [up, #12] + umaal r4, cya, u1, v0 + str r4, [rp, #12] + mov r4, #0 + umaal r5, cyb, u1, v1 +L(ko0): ldr u1, [up, #16]! + umaal r5, cya, u0, v0 + str r5, [rp, #16]! + mov r5, #0 + umaal r4, cyb, u0, v1 + subs i, i, #4 + bhi L(top) + + umaal r4, cya, u1, v0 + ldr u0, [up, #4] + umaal r5, cyb, u1, v1 + str r4, [rp, #4] + umaal r5, cya, u0, v0 + umaal cya, cyb, u0, v1 + str r5, [rp, #8] + str cya, [rp, #12] + str cyb, [rp, #16] + + add up, up, #4 + sub n, n, #1 + add rp, rp, #8 + bx r10 + +L(evnloop): + subs i, n, #6 + sub n, n, #2 + blt L(cor2) + ldm up, {v0,v1,u1} + add up, up, #8 + mov cya, #0 + mov cyb, #0 + ldr r4, [rp, #-4] + umaal r4, cya, v1, v0 + str r4, [rp, #-4] + ldr r4, [rp, #0] + + ALIGN(16) +L(ua2): ldr r5, [rp, #4] + umaal r4, cya, u1, v0 + ldr u0, [up, #4] + umaal r5, cyb, u1, v1 + str r4, [rp, #0] + ldr r4, [rp, #8] + umaal r5, cya, u0, v0 + ldr u1, [up, #8] + umaal r4, cyb, u0, v1 + str r5, [rp, #4] + ldr r5, [rp, #12] + umaal r4, cya, u1, v0 + ldr u0, [up, #12] + umaal r5, cyb, u1, v1 + str r4, [rp, #8] + ldr r4, [rp, #16]! + umaal r5, cya, u0, v0 + ldr u1, [up, #16]! 
+ umaal r4, cyb, u0, v1 + str r5, [rp, #-4] + subs i, i, #4 + bhs L(ua2) + + umaal r4, cya, u1, v0 + umaal cya, cyb, u1, v1 + str r4, [rp, #0] + str cya, [rp, #4] + str cyb, [rp, #8] +L(am2_0m4): + sub rp, rp, n, lsl #2 + sub up, up, n, lsl #2 + add rp, rp, #8 + + sub i, n, #4 + sub n, n, #2 + ldm up, {v0,v1,u1} + mov cya, #0 + mov cyb, #0 + ldr r4, [rp, #4] + umaal r4, cya, v1, v0 + str r4, [rp, #4] + ldr r4, [rp, #8] + b L(lo0) + + ALIGN(16) +L(ua0): ldr r5, [rp, #4] + umaal r4, cya, u1, v0 + ldr u0, [up, #4] + umaal r5, cyb, u1, v1 + str r4, [rp, #0] + ldr r4, [rp, #8] + umaal r5, cya, u0, v0 + ldr u1, [up, #8] + umaal r4, cyb, u0, v1 + str r5, [rp, #4] +L(lo0): ldr r5, [rp, #12] + umaal r4, cya, u1, v0 + ldr u0, [up, #12] + umaal r5, cyb, u1, v1 + str r4, [rp, #8] + ldr r4, [rp, #16]! + umaal r5, cya, u0, v0 + ldr u1, [up, #16]! + umaal r4, cyb, u0, v1 + str r5, [rp, #-4] + subs i, i, #4 + bhs L(ua0) + + umaal r4, cya, u1, v0 + umaal cya, cyb, u1, v1 + str r4, [rp, #0] + str cya, [rp, #4] + str cyb, [rp, #8] +L(am2_2m4): + sub rp, rp, n, lsl #2 + sub up, up, n, lsl #2 + add rp, rp, #16 + b L(evnloop) + + +L(oddloop): + sub i, n, #5 + sub n, n, #2 + ldm up, {v0,v1,u0} + mov cya, #0 + mov cyb, #0 + ldr r5, [rp, #0] + umaal r5, cya, v1, v0 + str r5, [rp, #0] + ldr r5, [rp, #4] + add up, up, #4 + b L(lo1) + + ALIGN(16) +L(ua1): ldr r5, [rp, #4] + umaal r4, cya, u1, v0 + ldr u0, [up, #4] + umaal r5, cyb, u1, v1 + str r4, [rp, #0] +L(lo1): ldr r4, [rp, #8] + umaal r5, cya, u0, v0 + ldr u1, [up, #8] + umaal r4, cyb, u0, v1 + str r5, [rp, #4] + ldr r5, [rp, #12] + umaal r4, cya, u1, v0 + ldr u0, [up, #12] + umaal r5, cyb, u1, v1 + str r4, [rp, #8] + ldr r4, [rp, #16]! + umaal r5, cya, u0, v0 + ldr u1, [up, #16]! 
+ umaal r4, cyb, u0, v1 + str r5, [rp, #-4] + subs i, i, #4 + bhs L(ua1) + + umaal r4, cya, u1, v0 + umaal cya, cyb, u1, v1 + str r4, [rp, #0] + str cya, [rp, #4] + str cyb, [rp, #8] +L(am2_3m4): + sub rp, rp, n, lsl #2 + sub up, up, n, lsl #2 + add rp, rp, #4 + + subs i, n, #3 + beq L(cor3) + sub n, n, #2 + ldm up, {v0,v1,u0} + mov cya, #0 + mov cyb, #0 + ldr r5, [rp, #8] + sub up, up, #4 + umaal r5, cya, v1, v0 + str r5, [rp, #8] + ldr r5, [rp, #12] + b L(lo3) + + ALIGN(16) +L(ua3): ldr r5, [rp, #4] + umaal r4, cya, u1, v0 + ldr u0, [up, #4] + umaal r5, cyb, u1, v1 + str r4, [rp, #0] + ldr r4, [rp, #8] + umaal r5, cya, u0, v0 + ldr u1, [up, #8] + umaal r4, cyb, u0, v1 + str r5, [rp, #4] + ldr r5, [rp, #12] + umaal r4, cya, u1, v0 + ldr u0, [up, #12] + umaal r5, cyb, u1, v1 + str r4, [rp, #8] +L(lo3): ldr r4, [rp, #16]! + umaal r5, cya, u0, v0 + ldr u1, [up, #16]! + umaal r4, cyb, u0, v1 + str r5, [rp, #-4] + subs i, i, #4 + bhs L(ua3) + + umaal r4, cya, u1, v0 + umaal cya, cyb, u1, v1 + str r4, [rp, #0] + str cya, [rp, #4] + str cyb, [rp, #8] +L(am2_1m4): + sub rp, rp, n, lsl #2 + sub up, up, n, lsl #2 + add rp, rp, #12 + b L(oddloop) + + +L(cor3):ldm up, {v0,v1,u0} + ldr r5, [rp, #8] + mov cya, #0 + mov cyb, #0 + umaal r5, cya, v1, v0 + str r5, [rp, #8] + ldr r5, [rp, #12] + ldr r4, [rp, #16] + umaal r5, cya, u0, v0 + ldr u1, [up, #12] + umaal r4, cyb, u0, v1 + str r5, [rp, #12] + umaal r4, cya, u1, v0 + umaal cya, cyb, u1, v1 + str r4, [rp, #16] + str cya, [rp, #20] + str cyb, [rp, #24] + add up, up, #16 + mov cya, cyb + adds rp, rp, #36 C clear cy + mov cyb, #0 + umaal cya, cyb, u1, u0 + b L(sqr_diag_addlsh1) + +L(cor2): + ldm up!, {v0,v1,u0} + mov r4, cya + mov r5, cyb + mov cya, #0 + umaal r4, cya, v1, v0 + mov cyb, #0 + umaal r5, cya, u0, v0 + strd r4, r5, [rp, #-4] + umaal cya, cyb, u0, v1 + add rp, rp, #16 +C b L(sqr_diag_addlsh1) + + +define(`w0', r6) +define(`w1', r7) +define(`w2', r8) +define(`rbx', r9) + +L(sqr_diag_addlsh1): + str cya, [rp, #-12] + 
str cyb, [rp, #-8] + sub n, n_saved, #1 + sub up, up, n_saved, lsl #2 + sub rp, rp, n_saved, lsl #3 + ldr r3, [up], #4 + umull w1, r5, r3, r3 + mov w2, #0 + mov r10, #0 +C cmn r0, #0 C clear cy (already clear) + b L(lm) + +L(tsd): adds w0, w0, rbx + adcs w1, w1, r4 + str w0, [rp, #0] +L(lm): ldr w0, [rp, #4] + str w1, [rp, #4] + ldr w1, [rp, #8]! + add rbx, r5, w2 + adcs w0, w0, w0 + ldr r3, [up], #4 + adcs w1, w1, w1 + adc w2, r10, r10 + umull r4, r5, r3, r3 + subs n, n, #1 + bne L(tsd) + + adds w0, w0, rbx + adcs w1, w1, r4 + adc w2, r5, w2 + stm rp, {w0,w1,w2} + + pop {r4-r11, pc} + + +C Straight line code for n <= 4 + +L(1): ldr r3, [up, #0] + umull r1, r2, r3, r3 + stm rp, {r1,r2} + bx r14 + +L(2): push {r4-r5} + ldm up, {r5,r12} + umull r1, r2, r5, r5 + umull r3, r4, r12, r12 + umull r5, r12, r5, r12 + adds r5, r5, r5 + adcs r12, r12, r12 + adc r4, r4, #0 + adds r2, r2, r5 + adcs r3, r3, r12 + adc r4, r4, #0 + stm rp, {r1,r2,r3,r4} + pop {r4-r5} + bx r14 + +L(3): push {r4-r11} + ldm up, {r7,r8,r9} + umull r1, r2, r7, r7 + umull r3, r4, r8, r8 + umull r5, r6, r9, r9 + umull r10, r11, r7, r8 + mov r12, #0 + umlal r11, r12, r7, r9 + mov r7, #0 + umlal r12, r7, r8, r9 + adds r10, r10, r10 + adcs r11, r11, r11 + adcs r12, r12, r12 + adcs r7, r7, r7 + adc r6, r6, #0 + adds r2, r2, r10 + adcs r3, r3, r11 + adcs r4, r4, r12 + adcs r5, r5, r7 + adc r6, r6, #0 + stm rp, {r1,r2,r3,r4,r5,r6} + pop {r4-r11} + bx r14 + +L(4): push {r4-r11, r14} + ldm up, {r9,r10,r11,r12} + umull r1, r2, r9, r9 + umull r3, r4, r10, r10 + umull r5, r6, r11, r11 + umull r7, r8, r12, r12 + stm rp, {r1,r2,r3,r4,r5,r6,r7} + umull r1, r2, r9, r10 + mov r3, #0 + umlal r2, r3, r9, r11 + mov r4, #0 + umlal r3, r4, r9, r12 + mov r5, #0 + umlal r3, r5, r10, r11 + umaal r4, r5, r10, r12 + mov r6, #0 + umlal r5, r6, r11, r12 + adds r1, r1, r1 + adcs r2, r2, r2 + adcs r3, r3, r3 + adcs r4, r4, r4 + adcs r5, r5, r5 + adcs r6, r6, r6 + add rp, rp, #4 + adc r7, r8, #0 + ldm rp, {r8,r9,r10,r11,r12,r14} + 
adds r1, r1, r8 + adcs r2, r2, r9 + adcs r3, r3, r10 + adcs r4, r4, r11 + adcs r5, r5, r12 + adcs r6, r6, r14 + adc r7, r7, #0 + stm rp, {r1,r2,r3,r4,r5,r6,r7} + pop {r4-r11, pc} +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6/submul_1.asm b/gmp-6.3.0/mpn/arm/v6/submul_1.asm new file mode 100644 index 0000000..8a21733 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6/submul_1.asm @@ -0,0 +1,125 @@ +dnl ARM mpn_submul_1. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM: - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 3.75 +C Cortex-A15 4.0 + +C This loop complements U on the fly, +C U' = B^n - 1 - U +C and then uses that +C R - U*v = R + U'*v + v - B^n v + +C TODO +C * Micro-optimise feed-in code. +C * Optimise for n=1,2 by delaying register saving. +C * Try using ldm/stm. 
+ +define(`rp',`r0') +define(`up',`r1') +define(`n', `r2') +define(`v0',`r3') + +ASM_START() +PROLOGUE(mpn_submul_1) + stmfd sp!, { r4, r5, r6, r7 } + + ands r6, n, #3 + mov r12, v0 + beq L(fi0) + cmp r6, #2 + bcc L(fi1) + beq L(fi2) + +L(fi3): ldr r4, [up], #12 + mvn r4, r4 + ldr r6, [rp, #0] + ldr r5, [up, #-8] + b L(lo3) + +L(fi0): ldr r5, [up], #16 + mvn r5, r5 + ldr r7, [rp], #4 + ldr r4, [up, #-12] + b L(lo0) + +L(fi1): ldr r4, [up], #4 + mvn r4, r4 + ldr r6, [rp], #8 + subs n, n, #1 + beq L(1) + ldr r5, [up] + b L(lo1) + +L(fi2): ldr r5, [up], #8 + mvn r5, r5 + ldr r7, [rp], #12 + ldr r4, [up, #-4] + b L(lo2) + + ALIGN(16) +L(top): ldr r6, [rp, #-8] + ldr r5, [up] + str r7, [rp, #-12] +L(lo1): umaal r6, r12, r4, v0 + add up, up, #16 + mvn r5, r5 + ldr r7, [rp, #-4] + ldr r4, [up, #-12] + str r6, [rp, #-8] +L(lo0): umaal r7, r12, r5, v0 + mvn r4, r4 + ldr r6, [rp, #0] + ldr r5, [up, #-8] + str r7, [rp, #-4] +L(lo3): umaal r6, r12, r4, v0 + mvn r5, r5 + ldr r7, [rp, #4] + ldr r4, [up, #-4] + str r6, [rp], #16 +L(lo2): umaal r7, r12, r5, v0 + mvn r4, r4 + subs n, n, #4 + bhi L(top) + + ldr r6, [rp, #-8] + str r7, [rp, #-12] +L(1): umaal r6, r12, r4, v0 + str r6, [rp, #-8] + sub r0, v0, r12 + ldmfd sp!, { r4, r5, r6, r7 } + bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6t2/divrem_1.asm b/gmp-6.3.0/mpn/arm/v6t2/divrem_1.asm new file mode 100644 index 0000000..be24615 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6t2/divrem_1.asm @@ -0,0 +1,212 @@ +dnl ARM v6t2 mpn_divrem_1 and mpn_preinv_divrem_1. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. 
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C norm unorm frac +C StrongARM - - - +C XScale - - - +C Cortex-A7 ? ? ? +C Cortex-A8 ? ? ? +C Cortex-A9 13 14 13 +C Cortex-A15 11.4 11.8 11.1 + +C TODO +C * Optimise inner-loops better, they could likely run a cycle or two faster. +C * Decrease register usage, streamline non-loop code. 
+ +define(`qp_arg', `r0') +define(`fn', `r1') +define(`up_arg', `r2') +define(`n_arg', `r3') +define(`d_arg', `0') +define(`dinv_arg',`4') +define(`cnt_arg', `8') + +define(`n', `r9') +define(`qp', `r5') +define(`up', `r6') +define(`cnt', `r7') +define(`tnc', `r10') +define(`dinv', `r0') +define(`d', `r4') + +ASM_START() +PROLOGUE(mpn_preinv_divrem_1) + stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr} C 9 words pushed, so stack args sit 9*4 above sp + ldr d, [sp, #9*4+d_arg] + ldr cnt, [sp, #9*4+cnt_arg] C caller-supplied shift count + str r1, [sp, #9*4+d_arg] C reuse d stack slot for fn + sub n, r3, #1 + add r3, r1, n + cmp d, #0 + add qp, qp_arg, r3, lsl #2 C put qp at Q[] end + add up, up_arg, n, lsl #2 C put up at U[] end + ldr dinv, [sp, #9*4+dinv_arg] C caller-supplied inverse, no mpn_invert_limb call needed + blt L(nent) C top bit of d set: join the normalised path + b L(uent) +EPILOGUE() + +PROLOGUE(mpn_divrem_1) + stmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, lr} + sub n, r3, #1 + ldr d, [sp, #9*4+d_arg] C d + str r1, [sp, #9*4+d_arg] C reuse d stack slot for fn + add r3, r1, n + cmp d, #0 + add qp, qp_arg, r3, lsl #2 C put qp at Q[] end + add up, up_arg, n, lsl #2 C put up at U[] end + blt L(normalised) C top bit of d set + +L(unnorm): + clz cnt, d C cnt = number of leading zero bits in d + mov r0, d, lsl cnt C pass d << cnt + bl mpn_invert_limb +L(uent): + mov d, d, lsl cnt C d <<= cnt + cmp n, #0 + mov r1, #0 C r + blt L(frac) + + ldr r11, [up, #0] + + rsb tnc, cnt, #32 C tnc = 32 - cnt + mov r1, r11, lsr tnc + mov r11, r11, lsl cnt + beq L(uend) C flags still from cmp n above: one integer limb only + + ldr r3, [up, #-4]! + orr r2, r11, r3, lsr tnc + b L(mid) + +L(utop): + mls r1, d, r8, r11 + mov r11, r3, lsl cnt + ldr r3, [up, #-4]!
+ cmp r1, r2 + addhi r1, r1, d + subhi r8, r8, #1 + orr r2, r11, r3, lsr tnc + cmp r1, d + bcs L(ufx) +L(uok): str r8, [qp], #-4 C store quotient limb, qp walks downwards +L(mid): add r8, r1, #1 + mov r11, r2 + umlal r2, r8, r1, dinv + subs n, n, #1 + bne L(utop) + + mls r1, d, r8, r11 + mov r11, r3, lsl cnt + cmp r1, r2 + addhi r1, r1, d + subhi r8, r8, #1 + cmp r1, d + rsbcs r1, d, r1 + addcs r8, r8, #1 + str r8, [qp], #-4 + +L(uend):add r8, r1, #1 + mov r2, r11 + umlal r2, r8, r1, dinv + mls r1, d, r8, r11 + cmp r1, r2 + addhi r1, r1, d + subhi r8, r8, #1 + cmp r1, d + rsbcs r1, d, r1 + addcs r8, r8, #1 + str r8, [qp], #-4 +L(frac): + ldr r2, [sp, #9*4+d_arg] C fn + cmp r2, #0 + beq L(fend) + +L(ftop):mov r6, #0 C r6 was up, no longer needed here + add r3, r1, #1 + umlal r6, r3, r1, dinv + mov r8, #0 + mls r1, d, r3, r8 + cmp r1, r6 + addhi r1, r1, d + subhi r3, r3, #1 + subs r2, r2, #1 + str r3, [qp], #-4 + bne L(ftop) + +L(fend):mov r11, r1, lsr cnt C shift the remainder back down by cnt +L(rtn): mov r0, r11 C return value + ldmfd sp!, {r4, r5, r6, r7, r8, r9, r10, r11, pc} + +L(normalised): + mov r0, d + bl mpn_invert_limb +L(nent): + cmp n, #0 + mov r11, #0 C r + blt L(nend) + + ldr r11, [up, #0] + cmp r11, d + movlo r2, #0 C hi q limb + movhs r2, #1 C hi q limb + subhs r11, r11, d + + str r2, [qp], #-4 + cmp n, #0 + beq L(nend) + +L(ntop):ldr r1, [up, #-4]! + add r12, r11, #1 + umlal r1, r12, r11, dinv + ldr r3, [up, #0] + mls r11, d, r12, r3 + cmp r11, r1 + addhi r11, r11, d + subhi r12, r12, #1 + cmp d, r11 + bls L(nfx) +L(nok): str r12, [qp], #-4 + subs n, n, #1 + bne L(ntop) + +L(nend):mov r1, r11 C r + mov cnt, #0 C shift cnt + b L(frac) + +L(nfx): add r12, r12, #1 C reached when r >= d: bump the quotient limb + rsb r11, d, r11 C r -= d + b L(nok) +L(ufx): rsb r1, d, r1 C reached when r >= d: r -= d + add r8, r8, #1 + b L(uok) +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6t2/gcd_11.asm b/gmp-6.3.0/mpn/arm/v6t2/gcd_11.asm new file mode 100644 index 0000000..8a38351 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6t2/gcd_11.asm @@ -0,0 +1,65 @@ +dnl ARM v6t2 mpn_gcd_11. + +dnl Copyright 2000-2002, 2005, 2009, 2011, 2012, 2019 Free Software Foundation, +dnl Inc.
+ +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/bit (approx) +C StrongARM - +C XScale - +C Cortex-A5 5.2 +C Cortex-A7 5.04 +C Cortex-A8 3.59 +C Cortex-A9 9.5 +C Cortex-A15 3.2 +C Cortex-A17 5.25 +C Cortex-A53 3.57 + +define(`u0', `r0') +define(`v0', `r1') + +ASM_START() + TEXT + ALIGN(64) +PROLOGUE(mpn_gcd_11) + subs r3, u0, v0 C 0 + beq L(end) C u == v: result is already in u0 + + ALIGN(16) +L(top): rbit r12, r3 C 1,5 + clz r12, r12 C 2 r12 = count of trailing zero bits in u-v + rsbcc r3, r3, #0 C v = abs(u-v), even 1 + movcs u0, v0 C u = min(u,v) 1 + lsr v0, r3, r12 C 3 + subs r3, u0, v0 C 4 + bne L(top) C loop until u == v + +L(end): bx lr C result is returned in r0, which is u0 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v6t2/gcd_22.asm b/gmp-6.3.0/mpn/arm/v6t2/gcd_22.asm new file mode 100644 index 0000000..3b23808 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v6t2/gcd_22.asm @@ -0,0 +1,113 @@ +dnl ARM v6t2 mpn_gcd_22. + +dnl Copyright 2019 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/bit (approx) +C StrongARM - +C XScale - +C Cortex-A5 10.1 +C Cortex-A7 9.1 +C Cortex-A8 6.3 +C Cortex-A9 ? +C Cortex-A12 7.7 +C Cortex-A15 5.7 +C Cortex-A17 ? 
+C Cortex-A53 7.0 + + +define(`gp', `r0') + +define(`u1', `r1') +define(`u0', `r2') +define(`v1', `r3') +define(`v0', `r4') + +define(`t0', `r5') +define(`t1', `r6') +define(`cnt', `r7') + +ASM_START() +PROLOGUE(mpn_gcd_22) + push { r4-r7 } C save r4-r7, 16 bytes, restored before return + + ldr v0, [sp,#16] C v0 is on the stack, just above the 4 saved words + +L(top): subs t0, u0, v0 C 0 7 + beq L(lowz) + sbcs t1, u1, v1 C 1 8 + + rbit cnt, t0 C 1 + + negcc t0, t0 + mvncc t1, t1 +L(bck): movcc v0, u0 + movcc v1, u1 + + clz cnt, cnt C 2 + rsb r12, cnt, #32 C 3 + + lsr u0, t0, cnt C 3 + lsl r12, t1, r12 C 4 + lsr u1, t1, cnt C 3 + orr u0, u0, r12 C 5 + + orrs r12, u1, v1 + bne L(top) C loop while either high limb is nonzero + + + str r12, [gp,#4] C high result limb <= 0 + + mov r6, gp C copy gp, r0 is needed for the call + mov r0, u0 C pass 1st argument + mov r1, v0 C pass 2nd argument + mov r7, r14 C preserve link register + bl mpn_gcd_11 + str r0, [r6,#0] C low result limb is the mpn_gcd_11 return value + mov r14, r7 + pop { r4-r7 } + bx r14 + +L(lowz):C We come here when v0 - u0 = 0 + C 1. If v1 - u1 = 0, then gcd is u = v. + C 2. Else compute gcd_21({v1,v0}, |u1-v1|) + subs t0, u1, v1 + beq L(end) + mov t1, #0 + rbit cnt, t0 C 1 + negcc t0, t0 + b L(bck) + +L(end): str v0, [gp,#0] C u equals v here, store v as the result + str v1, [gp,#4] + pop { r4-r7 } + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm new file mode 100644 index 0000000..c2277b3 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/addmul_1.asm @@ -0,0 +1,145 @@ +dnl ARM mpn_addmul_1 optimised for A15. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version.
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C StrongARM: - +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 6 3.25 +C Cortex-A15 2 this + +C This code uses umlal for adding in the rp[] data, keeping the recurrency path +C separate from any multiply instructions. It performs well on A15, at umlal's +C bandwidth. +C +C An A9 variant should perhaps stick to 3-way unrolling, and use ldm and stm +C for all loads and stores. Alternatively, it could do 2-way or 4-way, but +C then alignment aware code will be necessary (adding O(1) bookkeeping +C overhead). +C +C We don't use r12 due to ldrd and strd limitations. 
+ +C Architecture requirements: +C v5 - +C v5t - +C v5te ldrd strd +C v6 - +C v6t2 - +C v7a - + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`v0', `r3') + +define(`w0', `r10') define(`w1', `r11') +define(`u0', `r8') define(`u1', `r9') + +ASM_START() +PROLOGUE(mpn_addmul_1) + push { r4-r11 } C save r4-r11, restored before return + + ands r6, n, #3 C r6 = n mod 4 picks the feed-in path + sub n, n, #3 + beq L(b00) + cmp r6, #2 + bcc L(b01) + beq L(b10) + +L(b11): mov r6, #0 + cmn r13, #0 C carry clear + ldr u1, [up], #-4 + ldr w1, [rp], #-4 + mov r7, #0 + b L(mid) + +L(b00): ldrd u0, u1, [up] + ldrd w0, w1, [rp] + mov r6, #0 + umlal w0, r6, u0, v0 + cmn r13, #0 C carry clear + mov r7, #0 + str w0, [rp] + b L(mid) + +L(b10): ldrd u0, u1, [up], #8 + ldrd w0, w1, [rp] + mov r4, #0 + umlal w0, r4, u0, v0 + cmn r13, #0 C carry clear + mov r5, #0 + str w0, [rp], #8 + umlal w1, r5, u1, v0 + tst n, n + bmi L(end) + b L(top) + +L(b01): mov r4, #0 C carry is already clear here since bcc was taken + ldr u1, [up], #4 + ldr w1, [rp], #4 + mov r5, #0 + umlal w1, r5, u1, v0 + tst n, n + bmi L(end) + + ALIGN(16) +L(top): ldrd u0, u1, [up, #0] + adcs r4, r4, w1 + ldrd w0, w1, [rp, #0] + mov r6, #0 + umlal w0, r6, u0, v0 C 1 2 + adcs r5, r5, w0 + mov r7, #0 + strd r4, r5, [rp, #-4] +L(mid): umlal w1, r7, u1, v0 C 2 3 + ldrd u0, u1, [up, #8] + adcs r6, r6, w1 + ldrd w0, w1, [rp, #8] + mov r4, #0 + umlal w0, r4, u0, v0 C 3 4 + adcs r7, r7, w0 + mov r5, #0 + strd r6, r7, [rp, #4] + umlal w1, r5, u1, v0 C 0 1 + sub n, n, #4 C four limbs handled per iteration + add up, up, #16 + add rp, rp, #16 + tst n, n + bpl L(top) + +L(end): adcs r4, r4, w1 + str r4, [rp, #-4] + adc r0, r5, #0 C return the carry-out limb + pop { r4-r11 } + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm new file mode 100644 index 0000000..dc3f839 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/aors_n.asm @@ -0,0 +1,162 @@ +dnl ARM mpn_add_n/mpn_sub_n optimised for A15. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C StrongARM: - +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 3.55 2.5 +C Cortex-A15 1.27 this + +C This was a major improvement compared to the code we had before, but it might +C not be the best 8-way code possible. We've tried some permutations of auto- +C increments and separate pointer updates, but they all ran at the same speed +C on A15. 
+ +C Architecture requirements: +C v5 - +C v5t - +C v5te ldrd strd +C v6 - +C v6t2 - +C v7a - + +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +ifdef(`OPERATION_add_n', ` + define(`ADDSUBC', adcs) + define(`IFADD', `$1') + define(`SETCY', `cmp $1, #1') + define(`RETVAL', `adc r0, n, #0') + define(`RETVAL2', `adc r0, n, #1') + define(`func', mpn_add_n) + define(`func_nc', mpn_add_nc)') +ifdef(`OPERATION_sub_n', ` + define(`ADDSUBC', sbcs) + define(`IFADD', `') + define(`SETCY', `rsbs $1, $1, #0') + define(`RETVAL', `sbc r0, r0, r0 + and r0, r0, #1') + define(`RETVAL2', `RETVAL') + define(`func', mpn_sub_n) + define(`func_nc', mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc) + +ASM_START() +PROLOGUE(func_nc) + ldr r12, [sp] C carry-in is the first stack argument + b L(ent) +EPILOGUE() +PROLOGUE(func) + mov r12, #0 C plain entry point: no carry-in +L(ent): push { r4-r9 } C save r4-r9, restored before return + + ands r6, n, #3 C r6 = n mod 4 picks the feed-in path + mov n, n, lsr #2 C n = n / 4 + beq L(b00) + cmp r6, #2 + bcc L(b01) + beq L(b10) + +L(b11): ldr r5, [up], #4 + ldr r7, [vp], #4 + SETCY( r12) + ADDSUBC r9, r5, r7 + ldrd r4, r5, [up, #0] + ldrd r6, r7, [vp, #0] + str r9, [rp], #-4 + b L(lo) + +L(b00): ldrd r4, r5, [up], #-8 + ldrd r6, r7, [vp], #-8 + SETCY( r12) + sub rp, rp, #16 + b L(mid) + +L(b01): ldr r5, [up], #-4 + ldr r7, [vp], #-4 + SETCY( r12) + ADDSUBC r9, r5, r7 + str r9, [rp], #-12 + tst n, n + beq L(wd1) C n was 1: only the single limb above +L(gt1): ldrd r4, r5, [up, #8] + ldrd r6, r7, [vp, #8] + b L(mid) + +L(b10): ldrd r4, r5, [up] + ldrd r6, r7, [vp] + SETCY( r12) + sub rp, rp, #8 + b L(lo) + + ALIGN(16) +L(top): ldrd r4, r5, [up, #8] + ldrd r6, r7, [vp, #8] + strd r8, r9, [rp, #8] +L(mid): ADDSUBC r8, r4, r6 + ADDSUBC r9, r5, r7 + ldrd r4, r5, [up, #16] + ldrd r6, r7, [vp, #16] + strd r8, r9, [rp, #16] + ADDSUBC r8, r4, r6 + ADDSUBC r9, r5, r7 + sub n, n, #2 + tst n, n + bmi L(dne) C leave from the middle of the unrolled loop + ldrd r4, r5, [up, #24] + ldrd r6, r7, [vp, #24] + strd r8, r9, [rp, #24] + ADDSUBC r8, r4, r6 + ADDSUBC r9, r5, r7 + ldrd r4, r5, [up, #32]! + ldrd r6, r7, [vp, #32]!
+ strd r8, r9, [rp, #32]! +L(lo): ADDSUBC r8, r4, r6 + ADDSUBC r9, r5, r7 + tst n, n + bne L(top) + +L(end): strd r8, r9, [rp, #8] C store the last limb pair +L(wd1): RETVAL C r0 = carry-out or borrow-out + pop { r4-r9 } + bx r14 +L(dne): strd r8, r9, [rp, #24] C last limb pair for the mid-loop exit + RETVAL2 + pop { r4-r9 } + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm new file mode 100644 index 0000000..245b371 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/bdiv_q_1.asm @@ -0,0 +1,36 @@ +dnl ARM mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/.
+ +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) +include_mpn(`arm/v7a/cora8/bdiv_q_1.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm new file mode 100644 index 0000000..b9e5cd3 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/cnd_aors_n.asm @@ -0,0 +1,158 @@ +dnl ARM mpn_cnd_add_n/mpn_cnd_sub_n optimised for A15. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C StrongARM: - +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 3.75 3 +C Cortex-A15 1.78 this + +C This code does not run as well as one could have hoped, since 1.5 c/l seems +C realistic for this insn mix. 
+ +C Architecture requirements: +C v5 - +C v5t - +C v5te ldrd strd +C v6 - +C v6t2 - +C v7a - + +define(`cnd',`r0') +define(`rp', `r1') +define(`up', `r2') +define(`vp', `r3') +define(`n', `r12') + +ifdef(`OPERATION_cnd_add_n', ` + define(`ADDSUB', adds) + define(`ADDSUBC', adcs) + define(`IFADD', `$1') + define(`INITCY', `cmn r0, #0') + define(`RETVAL', `adc r0, n, #0') + define(`RETVAL2', `adc r0, n, #1') + define(`func', mpn_cnd_add_n) + define(`func_nc', mpn_add_nc)') +ifdef(`OPERATION_cnd_sub_n', ` + define(`ADDSUB', subs) + define(`ADDSUBC', sbcs) + define(`IFADD', `') + define(`INITCY', `cmp r0, #0') + define(`RETVAL', `sbc r0, r0, r0 + and r0, r0, #1') + define(`RETVAL2', `RETVAL') + define(`func', mpn_cnd_sub_n) + define(`func_nc', mpn_sub_nc)') + +MULFUNC_PROLOGUE(mpn_cnd_add_n mpn_cnd_sub_n) + +ASM_START() +PROLOGUE(func) + ldr n, [sp] C n is the first stack argument + push { r4-r9 } C save r4-r9, restored before return + + cmp cnd, #1 C carry ends up clear only when cnd is 0 + sbc cnd, cnd, cnd C conditionally set to 0xffffffff + + ands r6, n, #3 C r6 = n mod 4 picks the feed-in path + mov n, n, lsr #2 C n = n / 4 + beq L(b00) + cmp r6, #2 + bcc L(b01) + beq L(b10) + +L(b11): ldr r5, [up], #4 + ldr r7, [vp], #4 + bic r7, r7, cnd C zero the v limb when the incoming cnd was 0 + ADDSUB r9, r5, r7 + ldrd r4, r5, [up, #0] + ldrd r6, r7, [vp, #0] + bic r6, r6, cnd + bic r7, r7, cnd + str r9, [rp], #-4 + b L(lo) + +L(b00): ldrd r4, r5, [up], #-8 + ldrd r6, r7, [vp], #-8 + bic r6, r6, cnd + bic r7, r7, cnd + INITCY + sub rp, rp, #16 + b L(mid) + +L(b01): ldr r5, [up], #-4 + ldr r7, [vp], #-4 + bic r7, r7, cnd + ADDSUB r9, r5, r7 + str r9, [rp], #-12 + tst n, n + beq L(wd1) C n was 1: only the single limb above +L(gt1): ldrd r4, r5, [up, #8] + ldrd r6, r7, [vp, #8] + bic r6, r6, cnd + bic r7, r7, cnd + b L(mid) + +L(b10): ldrd r4, r5, [up] + ldrd r6, r7, [vp] + bic r6, r6, cnd + bic r7, r7, cnd + INITCY + sub rp, rp, #8 + b L(lo) + + ALIGN(16) +L(top): ldrd r6, r7, [vp, #8] + ldrd r4, r5, [up, #8] + bic r6, r6, cnd + bic r7, r7, cnd + strd r8, r9, [rp, #8] +L(mid): ADDSUBC r8, r4, r6 + ADDSUBC r9, r5, r7 + ldrd r6, r7, [vp, #16]! + ldrd r4, r5, [up, #16]!
+ bic r6, r6, cnd + bic r7, r7, cnd + sub n, n, #1 + strd r8, r9, [rp, #16]! +L(lo): ADDSUBC r8, r4, r6 + ADDSUBC r9, r5, r7 + tst n, n + bne L(top) + +L(end): strd r8, r9, [rp, #8] C store the last limb pair +L(wd1): RETVAL C r0 = carry-out or borrow-out + pop { r4-r9 } + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/com.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/com.asm new file mode 100644 index 0000000..a258afe --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/com.asm @@ -0,0 +1,180 @@ +dnl ARM mpn_com optimised for A15. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 2.5 +C Cortex-A15 1.0 + +C This is great A15 core register code, but it is a bit large. +C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
+ +C Architecture requirements: +C v5 - +C v5t - +C v5te ldrd strd +C v6 - +C v6t2 - +C v7a - + +define(`FEEDIN_VARIANT', 1) C alternatives: 0 1 2 +define(`UNROLL', 4x2) C alternatives: 4 4x2 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_com) + push { r4-r5,r8-r9 } C save the four registers used below, restored before return + +ifelse(FEEDIN_VARIANT,0,` + ands r12, n, #3 + mov n, n, lsr #2 + beq L(b00a) + tst r12, #1 + beq L(bx0) + ldr r5, [up], #4 + mvn r9, r5 + str r9, [rp], #4 + tst r12, #2 + beq L(b00) +L(bx0): ldrd r4, r5, [up, #0] + sub rp, rp, #8 + b L(lo) +L(b00): tst n, n + beq L(wd1) +L(b00a):ldrd r4, r5, [up], #-8 + sub rp, rp, #16 + b L(mid) +') +ifelse(FEEDIN_VARIANT,1,` + and r12, n, #3 + mov n, n, lsr #2 + tst r12, #1 + beq L(bx0) + ldr r5, [up], #4 + mvn r9, r5 + str r9, [rp], #4 +L(bx0): tst r12, #2 + beq L(b00) + ldrd r4, r5, [up, #0] + sub rp, rp, #8 + b L(lo) +L(b00): tst n, n + beq L(wd1) + ldrd r4, r5, [up], #-8 + sub rp, rp, #16 + b L(mid) +') +ifelse(FEEDIN_VARIANT,2,` + ands r12, n, #3 + mov n, n, lsr #2 + beq L(b00) + cmp r12, #2 + bcc L(b01) + beq L(b10) + +L(b11): ldr r5, [up], #4 + mvn r9, r5 + ldrd r4, r5, [up, #0] + str r9, [rp], #-4 + b L(lo) + +L(b00): ldrd r4, r5, [up], #-8 + sub rp, rp, #16 + b L(mid) + +L(b01): ldr r5, [up], #-4 + mvn r9, r5 + str r9, [rp], #-12 + tst n, n + beq L(wd1) +L(gt1): ldrd r4, r5, [up, #8] + b L(mid) + +L(b10): ldrd r4, r5, [up] + sub rp, rp, #8 + b L(lo) +') + ALIGN(16) +ifelse(UNROLL,4,` +L(top): ldrd r4, r5, [up, #8] + strd r8, r9, [rp, #8] +L(mid): mvn r8, r4 + mvn r9, r5 + ldrd r4, r5, [up, #16]! + strd r8, r9, [rp, #16]! + sub n, n, #1 +L(lo): mvn r8, r4 + mvn r9, r5 + tst n, n + bne L(top) +') +ifelse(UNROLL,4x2,` +L(top): ldrd r4, r5, [up, #8] + strd r8, r9, [rp, #8] +L(mid): mvn r8, r4 + mvn r9, r5 + ldrd r4, r5, [up, #16] + strd r8, r9, [rp, #16] + mvn r8, r4 + mvn r9, r5 + sub n, n, #2 + tst n, n + bmi L(dne) + ldrd r4, r5, [up, #24] + strd r8, r9, [rp, #24] + mvn r8, r4 + mvn r9, r5 + ldrd r4, r5, [up, #32]!
+ strd r8, r9, [rp, #32]! +L(lo): mvn r8, r4 + mvn r9, r5 + tst n, n + bne L(top) +') + +L(end): strd r8, r9, [rp, #8] C store the last complemented limb pair +L(wd1): pop { r4-r5,r8-r9 } C restore saved registers + bx r14 +ifelse(UNROLL,4x2,` +L(dne): strd r8, r9, [rp, #24] + pop { r4-r5,r8-r9 } + bx r14 +') +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h new file mode 100644 index 0000000..409cbbb --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/gmp-mparam.h @@ -0,0 +1,212 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/.
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 2000 MHz Cortex-A15 with Neon (in spite of file position) */ +/* FFT tuning limit = 50,736,668 */ +/* Generated by tuneup.c, 2019-10-22, gcc 5.4 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 2 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 8 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 49.14% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 17 + +#define DIV_1_VS_MUL_1_PERCENT 267 + +#define MUL_TOOM22_THRESHOLD 28 +#define MUL_TOOM33_THRESHOLD 114 +#define MUL_TOOM44_THRESHOLD 178 +#define MUL_TOOM6H_THRESHOLD 238 +#define MUL_TOOM8H_THRESHOLD 597 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 113 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 115 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 115 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 115 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 154 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 38 +#define SQR_TOOM3_THRESHOLD 126 +#define SQR_TOOM4_THRESHOLD 336 +#define SQR_TOOM6_THRESHOLD 446 +#define SQR_TOOM8_THRESHOLD 650 + +#define MULMID_TOOM42_THRESHOLD 52 + +#define MULMOD_BNM1_THRESHOLD 23 +#define SQRMOD_BNM1_THRESHOLD 17 + +#define MUL_FFT_MODF_THRESHOLD 575 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 575, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 27, 7}, { 15, 6}, { 31, 7}, { 19, 6}, \ + { 39, 7}, { 25, 6}, { 51, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 51, 8}, { 27, 9}, { 15, 8}, \ + { 31, 7}, { 63, 8}, { 39, 9}, { 23, 8}, \ + { 
55,10}, { 15, 9}, { 31, 8}, { 67, 9}, \ + { 39, 8}, { 79, 9}, { 47, 8}, { 95, 9}, \ + { 55,10}, { 31, 9}, { 71, 8}, { 143, 9}, \ + { 87,10}, { 47, 9}, { 111,11}, { 31,10}, \ + { 63, 9}, { 143,10}, { 79, 9}, { 159,10}, \ + { 95,11}, { 63,10}, { 143, 9}, { 287,10}, \ + { 159,11}, { 95,10}, { 191,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 335, 9}, \ + { 671,10}, { 367, 9}, { 735,11}, { 191,10}, \ + { 383, 9}, { 799,10}, { 415,11}, { 223,12}, \ + { 127,10}, { 543,11}, { 287,10}, { 575,11}, \ + { 319,10}, { 639,11}, { 351,10}, { 703,12}, \ + { 191,11}, { 383,10}, { 767,11}, { 415,10}, \ + { 831,11}, { 447,13}, { 127,12}, { 255,11}, \ + { 543,10}, { 1087,11}, { 575,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,12}, { 383,11}, \ + { 831,12}, { 447,11}, { 959,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1151,12}, \ + { 639,11}, { 1343,12}, { 703,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 895,14}, { 255,13}, \ + { 511,12}, { 1087,13}, { 639,12}, { 1407,13}, \ + { 767,12}, { 1599,13}, { 895,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \ + { 1279,12}, { 2559,13}, { 1407,14}, { 767,13}, \ + { 1535,12}, { 3135,13}, { 1663,15}, { 511,14}, \ + { 1023,13}, { 2303,14}, { 1279,13}, { 2559,12}, \ + { 5119,13}, { 2687,14}, { 1535,13}, { 3071,12}, \ + { 6143,13}, { 3199,12}, { 6399,14}, { 1791,15}, \ + { 1023,14}, { 2047,13}, { 4095,14}, { 2303,13}, \ + { 4607,12}, { 9215,13}, { 4863,12}, { 9727,14}, \ + { 2559,13}, { 5119,15}, { 1535,14}, { 3071,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 155 +#define MUL_FFT_THRESHOLD 5760 + +#define SQR_FFT_MODF_THRESHOLD 525 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 525, 5}, { 25, 6}, { 27, 7}, { 15, 6}, \ + { 31, 7}, { 17, 6}, { 35, 7}, { 19, 6}, \ + { 39, 7}, { 25, 6}, { 51, 7}, { 27, 8}, \ + { 15, 7}, { 33, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 51, 8}, { 27, 7}, { 55, 9}, \ + { 15, 8}, { 31, 7}, { 63, 8}, { 39, 
9}, \ + { 23, 8}, { 51,10}, { 15, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 99, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 95,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 95,11}, { 63,10}, { 143, 9}, \ + { 287, 8}, { 575, 9}, { 303,10}, { 159,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \ + { 191,10}, { 399, 9}, { 799,10}, { 415, 9}, \ + { 831,11}, { 223,10}, { 447,12}, { 127,10}, \ + { 543,11}, { 287,10}, { 575,11}, { 319,10}, \ + { 639,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 799,11}, { 415,10}, { 831,11}, \ + { 447,13}, { 127,11}, { 543,10}, { 1087,11}, \ + { 607,12}, { 319,11}, { 735,12}, { 383,11}, \ + { 831,12}, { 447,11}, { 959,12}, { 511,11}, \ + { 1023,12}, { 575,11}, { 1151,12}, { 639,11}, \ + { 1279,12}, { 703,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 831,11}, { 1663,12}, { 895,14}, \ + { 255,13}, { 511,12}, { 1087,13}, { 639,12}, \ + { 1343,13}, { 767,12}, { 1599,13}, { 895,14}, \ + { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \ + { 2303,13}, { 1279,14}, { 767,13}, { 1535,12}, \ + { 3135,13}, { 1663,15}, { 511,14}, { 1023,13}, \ + { 2047,12}, { 4095,13}, { 2303,14}, { 1279,13}, \ + { 2559,12}, { 5119,14}, { 1535,13}, { 3071,12}, \ + { 6143,13}, { 3199,12}, { 6399,14}, { 1791,15}, \ + { 1023,14}, { 2047,13}, { 4095,14}, { 2303,13}, \ + { 4607,12}, { 9215,13}, { 4863,12}, { 9727,14}, \ + { 2559,15}, { 1535,14}, { 3071,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 154 +#define SQR_FFT_THRESHOLD 5312 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 38 +#define MULLO_MUL_N_THRESHOLD 10950 +#define SQRLO_BASECASE_THRESHOLD 10 +#define SQRLO_DC_THRESHOLD 35 +#define SQRLO_SQR_THRESHOLD 10323 + +#define DC_DIV_QR_THRESHOLD 57 +#define DC_DIVAPPR_Q_THRESHOLD 254 +#define DC_BDIV_QR_THRESHOLD 48 +#define 
DC_BDIV_Q_THRESHOLD 286 + +#define INV_MULMOD_BNM1_THRESHOLD 55 +#define INV_NEWTON_THRESHOLD 252 +#define INV_APPR_THRESHOLD 252 + +#define BINV_NEWTON_THRESHOLD 372 +#define REDC_1_TO_REDC_2_THRESHOLD 61 +#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ + +#define MU_DIV_QR_THRESHOLD 1858 +#define MU_DIVAPPR_Q_THRESHOLD 1787 +#define MUPI_DIV_QR_THRESHOLD 122 +#define MU_BDIV_QR_THRESHOLD 1528 +#define MU_BDIV_Q_THRESHOLD 1836 + +#define POWM_SEC_TABLE 1,14,200,480,1532 + +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 33 +#define SET_STR_DC_THRESHOLD 104 +#define SET_STR_PRECOMPUTE_THRESHOLD 1120 + +#define FAC_DSC_THRESHOLD 164 +#define FAC_ODD_THRESHOLD 27 + +#define MATRIX22_STRASSEN_THRESHOLD 19 +#define HGCD2_DIV1_METHOD 1 /* 3.70% faster than 3 */ +#define HGCD_THRESHOLD 137 +#define HGCD_APPR_THRESHOLD 157 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 610 +#define GCDEXT_DC_THRESHOLD 443 +#define JACOBI_BASE_METHOD 4 /* 12.66% faster than 1 */ + +/* Tuneup completed successfully, took 69757 seconds */ diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm new file mode 100644 index 0000000..0602614 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/logops_n.asm @@ -0,0 +1,253 @@ +dnl ARM mpn_and_n, mpn_andn_n, mpn_nand_n, etc, optimised for A15. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version.
+dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb cycles/limb +C and andn ior xor nand iorn nior xnor +C StrongARM ? ? +C XScale ? ? +C Cortex-A7 ? ? +C Cortex-A8 ? ? +C Cortex-A9 3.5 3.56 +C Cortex-A15 1.27 1.64 + +C This is great A15 core register code, but it is a bit large. +C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling. 
+ +C Architecture requirements: +C v5 - +C v5t - +C v5te ldrd strd +C v6 - +C v6t2 - +C v7a - + +define(`FEEDIN_VARIANT', 1) C alternatives: 0 1 2 +define(`UNROLL', 4x2) C alternatives: 4 4x2 + +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +define(`POSTOP') C empty by default; the complementing operations below redefine it as mvn + +ifdef(`OPERATION_and_n',` + define(`func', `mpn_and_n') + define(`LOGOP', `and $1, $2, $3')') +ifdef(`OPERATION_andn_n',` + define(`func', `mpn_andn_n') + define(`LOGOP', `bic $1, $2, $3')') +ifdef(`OPERATION_nand_n',` + define(`func', `mpn_nand_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `and $1, $2, $3')') +ifdef(`OPERATION_ior_n',` + define(`func', `mpn_ior_n') + define(`LOGOP', `orr $1, $2, $3')') +ifdef(`OPERATION_iorn_n',` + define(`func', `mpn_iorn_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `bic $1, $3, $2')') C NOT of v AND NOT u equals u OR NOT v +ifdef(`OPERATION_nior_n',` + define(`func', `mpn_nior_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `orr $1, $2, $3')') +ifdef(`OPERATION_xor_n',` + define(`func', `mpn_xor_n') + define(`LOGOP', `eor $1, $2, $3')') +ifdef(`OPERATION_xnor_n',` + define(`func', `mpn_xnor_n') + define(`POSTOP', `mvn $1, $1') + define(`LOGOP', `eor $1, $2, $3')') + +MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n) + +ASM_START() +PROLOGUE(func) + push { r4-r9 } C saved here, popped again on every return path + +ifelse(FEEDIN_VARIANT,0,` + ands r6, n, #3 + mov n, n, lsr #2 + beq L(b00a) + tst r6, #1 + beq L(bx0) + ldr r5, [up], #4 + ldr r7, [vp], #4 + LOGOP( r9, r5, r7) + POSTOP( r9) + str r9, [rp], #4 + tst r6, #2 + beq L(b00) +L(bx0): ldrd r4, r5, [up, #0] + ldrd r6, r7, [vp, #0] + sub rp, rp, #8 + b L(lo) +L(b00): tst n, n + beq L(wd1) +L(b00a):ldrd r4, r5, [up], #-8 + ldrd r6, r7, [vp], #-8 + sub rp, rp, #16 + b L(mid) +') +ifelse(FEEDIN_VARIANT,1,` + and r6, n, #3 C r6 = n mod 4 + mov n, n, lsr #2 C n = n / 4 + tst r6, #1 + beq L(bx0) + ldr r5, [up], #4 C odd n: do one limb on its own + ldr r7, [vp], #4 + LOGOP( r9, r5, r7) + POSTOP( r9) + str r9, [rp], #4 +L(bx0): tst r6, #2 + beq L(b00) + ldrd
r4, r5, [up, #0] + ldrd r6, r7, [vp, #0] + sub rp, rp, #8 + b L(lo) +L(b00): tst n, n C nothing left when n / 4 is zero + beq L(wd1) + ldrd r4, r5, [up], #-8 + ldrd r6, r7, [vp], #-8 + sub rp, rp, #16 + b L(mid) +') +ifelse(FEEDIN_VARIANT,2,` + ands r6, n, #3 + mov n, n, lsr #2 + beq L(b00) + cmp r6, #2 + bcc L(b01) + beq L(b10) + +L(b11): ldr r5, [up], #4 + ldr r7, [vp], #4 + LOGOP( r9, r5, r7) + ldrd r4, r5, [up, #0] + ldrd r6, r7, [vp, #0] + POSTOP( r9) + str r9, [rp], #-4 + b L(lo) + +L(b00): ldrd r4, r5, [up], #-8 + ldrd r6, r7, [vp], #-8 + sub rp, rp, #16 + b L(mid) + +L(b01): ldr r5, [up], #-4 + ldr r7, [vp], #-4 + LOGOP( r9, r5, r7) + POSTOP( r9) + str r9, [rp], #-12 + tst n, n + beq L(wd1) +L(gt1): ldrd r4, r5, [up, #8] + ldrd r6, r7, [vp, #8] + b L(mid) + +L(b10): ldrd r4, r5, [up] + ldrd r6, r7, [vp] + sub rp, rp, #8 + b L(lo) +') + ALIGN(16) +ifelse(UNROLL,4,` +L(top): ldrd r4, r5, [up, #8] + ldrd r6, r7, [vp, #8] + POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #8] +L(mid): LOGOP( r8, r4, r6) + LOGOP( r9, r5, r7) + ldrd r4, r5, [up, #16]! + ldrd r6, r7, [vp, #16]! + POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #16]! + sub n, n, #1 +L(lo): LOGOP( r8, r4, r6) + LOGOP( r9, r5, r7) + tst n, n + bne L(top) +') +ifelse(UNROLL,4x2,` +L(top): ldrd r4, r5, [up, #8] + ldrd r6, r7, [vp, #8] + POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #8] +L(mid): LOGOP( r8, r4, r6) + LOGOP( r9, r5, r7) + ldrd r4, r5, [up, #16] + ldrd r6, r7, [vp, #16] + POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #16] + LOGOP( r8, r4, r6) + LOGOP( r9, r5, r7) + sub n, n, #2 C each full pass of this loop uses up two counts of n + tst n, n + bmi L(dne) C negative n: leave through the extra tail + ldrd r4, r5, [up, #24] + ldrd r6, r7, [vp, #24] + POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #24] + LOGOP( r8, r4, r6) + LOGOP( r9, r5, r7) + ldrd r4, r5, [up, #32]! + ldrd r6, r7, [vp, #32]! + POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #32]!
+L(lo): LOGOP( r8, r4, r6) + LOGOP( r9, r5, r7) + tst n, n + bne L(top) +') + +L(end): POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #8] C store the final pair of limbs +L(wd1): pop { r4-r9 } + bx r14 +ifelse(UNROLL,4x2,` +L(dne): POSTOP( r8) + POSTOP( r9) + strd r8, r9, [rp, #24] C final pair for the bmi exit + pop { r4-r9 } + bx r14 +') +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm new file mode 100644 index 0000000..766ba5c --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/mul_1.asm @@ -0,0 +1,104 @@ +dnl ARM mpn_mul_1 optimised for A15. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C StrongARM: - +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 5.25 3.25 +C Cortex-A15 2.25 this + + +C This runs well on A15 but very poorly on A9. By scheduling loads and adds +C it is possible to get good A9 performance as well, but at the cost of using +C many more (callee-saves) registers.
+ +C This is armv5 code, optimized for the armv7a cpu A15. Its location in the +C GMP file structure might be misleading. + + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`v0', `r3') + +ASM_START() +PROLOGUE(mpn_mul_1c) + ldr r12, [sp] C r12 = initial carry limb, read from the stack at entry + b L(ent) +EPILOGUE() +PROLOGUE(mpn_mul_1) + mov r12, #0 C plain mpn_mul_1 starts with a zero carry limb +L(ent): push {r4-r7} + + ldr r6, [up], #4 + tst n, #1 C the low two bits of n pick the entry point into the 4-way loop + beq L(bx0) + +L(bx1): umull r4, r7, r6, v0 + adds r4, r4, r12 + tst n, #2 + beq L(lo1) + b L(lo3) + +L(bx0): umull r4, r5, r6, v0 + adds r4, r4, r12 + tst n, #2 + beq L(lo0) + b L(lo2) + +L(top): ldr r6, [up], #4 + str r4, [rp], #4 + umull r4, r5, r6, v0 + adds r4, r4, r7 +L(lo0): ldr r6, [up], #4 + str r4, [rp], #4 + umull r4, r7, r6, v0 + adcs r4, r4, r5 +L(lo3): ldr r6, [up], #4 + str r4, [rp], #4 + umull r4, r5, r6, v0 + adcs r4, r4, r7 +L(lo2): ldr r6, [up], #4 + str r4, [rp], #4 + umull r4, r7, r6, v0 + adcs r4, r4, r5 +L(lo1): adc r7, r7, #0 C fold the pending carry into the high limb before subs changes the flags + subs n, n, #4 + bgt L(top) + + str r4, [rp] + mov r0, r7 C r0 = final high limb, the function result + pop {r4-r7} + bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm new file mode 100644 index 0000000..d8cfe3f --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm @@ -0,0 +1,43 @@ +dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version.
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 1) C shift count used by the shared code included below + +ifdef(`OPERATION_addlsh1_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n) + +include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm new file mode 100644 index 0000000..b48204d --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm @@ -0,0 +1,43 @@ +dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here.
+dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) C shift count used by the shared code included below + +ifdef(`OPERATION_addlsh2_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n) + +include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm new file mode 100644 index 0000000..51f93c1 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm @@ -0,0 +1,144 @@ +dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 5.25 +C Cortex-A15 2.25 + +C TODO +C * Consider using 4-way feed-in code. +C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps +C insufficiently for A7 and A8. + +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +ifdef(`DO_add', ` + define(`ADCSBCS', `adcs $1, $2, $3') + define(`CLRCY', `cmn r13, #1') + define(`RETVAL', `adc r0, $1, #0') + define(`func', mpn_addlsh`'LSH`'_n)') +ifdef(`DO_sub', ` + define(`ADCSBCS', `sbcs $1, $2, $3') + define(`CLRCY', `cmp r13, #0') + define(`RETVAL', `sbc $2, $2, $2 + cmn $2, #1 + adc r0, $1, #0') + define(`func', mpn_sublsh`'LSH`'_n)') +ifdef(`DO_rsb', ` + define(`ADCSBCS', `sbcs $1, $3, $2') + define(`CLRCY', `cmp r13, #0') + define(`RETVAL', `sbc r0, $1, #0') + define(`func', mpn_rsblsh`'LSH`'_n)') + + +ASM_START() +PROLOGUE(func) + push {r4-r10} + vmov.i8 d0, #0 C could feed carry through here + CLRCY C start the chain with no carry or borrow pending + tst n, #1 C odd n: peel off one limb first + beq L(bb0) + +L(bb1): vld1.32 {d3[0]}, [vp]! + vsli.u32 d0, d3, #LSH C limb shifted left by LSH, low bits taken from d0 + ldr r12, [up], #4 + vmov.32 r5, d0[0] + vshr.u32 d0, d3, #32-LSH C bits shifted out, kept for the next limb + ADCSBCS( r12, r12, r5) + str r12, [rp], #4 + bics n, n, #1 + beq L(rtn) + +L(bb0): tst n, #2 + beq L(b00) + +L(b10): vld1.32 {d3}, [vp]! + vsli.u64 d0, d3, #LSH + ldmia up!, {r10,r12} + vmov r4, r5, d0 + vshr.u64 d0, d3, #64-LSH + ADCSBCS( r10, r10, r4) + ADCSBCS( r12, r12, r5) + stmia rp!, {r10,r12} + bics n, n, #2 + beq L(rtn) + +L(b00): vld1.32 {d2}, [vp]! + vsli.u64 d0, d2, #LSH + vshr.u64 d1, d2, #64-LSH + vld1.32 {d3}, [vp]!
+ vsli.u64 d1, d3, #LSH + vmov r6, r7, d0 + vshr.u64 d0, d3, #64-LSH + sub n, n, #4 C plain sub and tst keep the carry flag for the ADCSBCS chain + tst n, n + beq L(end) + + ALIGN(16) +L(top): ldmia up!, {r8,r9,r10,r12} + vld1.32 {d2}, [vp]! + vsli.u64 d0, d2, #LSH + vmov r4, r5, d1 + vshr.u64 d1, d2, #64-LSH + ADCSBCS( r8, r8, r6) + ADCSBCS( r9, r9, r7) + vld1.32 {d3}, [vp]! + vsli.u64 d1, d3, #LSH + vmov r6, r7, d0 + vshr.u64 d0, d3, #64-LSH + ADCSBCS( r10, r10, r4) + ADCSBCS( r12, r12, r5) + stmia rp!, {r8,r9,r10,r12} + sub n, n, #4 + tst n, n + bne L(top) + +L(end): ldmia up!, {r8,r9,r10,r12} + vmov r4, r5, d1 + ADCSBCS( r8, r8, r6) + ADCSBCS( r9, r9, r7) + ADCSBCS( r10, r10, r4) + ADCSBCS( r12, r12, r5) + stmia rp!, {r8,r9,r10,r12} +L(rtn): vmov.32 r0, d0[0] C bits shifted out of the most significant limb + RETVAL( r0, r1) C fold in the final carry or borrow + pop {r4-r10} + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm new file mode 100644 index 0000000..9e7a629 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm @@ -0,0 +1,97 @@ +dnl ARM Neon mpn_com optimised for A15. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details.
+dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A8 ? +C Cortex-A9 2.1 +C Cortex-A15 0.65 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_com) + cmp n, #7 C short operands skip the alignment step and the main loop + ble L(bc) + +C Perform a few initial operations until rp is 128-bit aligned + tst rp, #4 + beq L(al1) + vld1.32 {d0[0]}, [up]! + sub n, n, #1 + vmvn d0, d0 + vst1.32 {d0[0]}, [rp]! +L(al1): tst rp, #8 + beq L(al2) + vld1.32 {d0}, [up]! + sub n, n, #2 + vmvn d0, d0 + vst1.32 {d0}, [rp:64]! +L(al2): vld1.32 {q2}, [up]! + subs n, n, #12 C bias n: 4 limbs are already in q2 and a loop pass needs 8 more + blt L(end) + + ALIGN(16) +L(top): vld1.32 {q0}, [up]! + vmvn q2, q2 + subs n, n, #8 + vst1.32 {q2}, [rp:128]! + vld1.32 {q2}, [up]! + vmvn q0, q0 + vst1.32 {q0}, [rp:128]! + bge L(top) + +L(end): vmvn q2, q2 + vst1.32 {q2}, [rp:128]! + +C Handle last 0-7 limbs. Note that rp is aligned after loop, but not when we +C arrive here via L(bc) +L(bc): tst n, #4 C the low three bits of n give the number of limbs still to do + beq L(tl1) + vld1.32 {q0}, [up]! + vmvn q0, q0 + vst1.32 {q0}, [rp]! +L(tl1): tst n, #2 + beq L(tl2) + vld1.32 {d0}, [up]! + vmvn d0, d0 + vst1.32 {d0}, [rp]! +L(tl2): tst n, #1 + beq L(tl3) + vld1.32 {d0[0]}, [up] + vmvn d0, d0 + vst1.32 {d0[0]}, [rp] +L(tl3): bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm new file mode 100644 index 0000000..98fe535 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm @@ -0,0 +1,110 @@ +dnl ARM Neon mpn_copyd optimised for A15. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? 
+C Cortex-A9 1.75 slower than core register code +C Cortex-A15 0.52 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_copyd) + add rp, rp, n, lsl #2 C point rp and up just past their last limbs, the copy runs downwards + add up, up, n, lsl #2 + + cmp n, #7 C short operands go straight to the tail code + ble L(bc) + +C Copy until rp is 128-bit aligned + tst rp, #4 + beq L(al1) + sub up, up, #4 + vld1.32 {d22[0]}, [up] + sub n, n, #1 + sub rp, rp, #4 + vst1.32 {d22[0]}, [rp] +L(al1): tst rp, #8 + beq L(al2) + sub up, up, #8 + vld1.32 {d22}, [up] + sub n, n, #2 + sub rp, rp, #8 + vst1.32 {d22}, [rp:64] +L(al2): sub up, up, #16 + vld1.32 {d26-d27}, [up] + subs n, n, #12 + sub rp, rp, #16 C offset rp for loop + blt L(end) + + sub up, up, #16 C offset up for loop + mov r12, #-16 C post-index step: both pointers move down 16 bytes per access + + ALIGN(16) +L(top): vld1.32 {d22-d23}, [up], r12 + vst1.32 {d26-d27}, [rp:128], r12 + vld1.32 {d26-d27}, [up], r12 + vst1.32 {d22-d23}, [rp:128], r12 + subs n, n, #8 + bge L(top) + + add up, up, #16 C undo up offset + C rp offset undoing folded +L(end): vst1.32 {d26-d27}, [rp:128] + +C Copy last 0-7 limbs. Note that rp is aligned after loop, but not when we +C arrive here via L(bc) +L(bc): tst n, #4 + beq L(tl1) + sub up, up, #16 + vld1.32 {d22-d23}, [up] + sub rp, rp, #16 + vst1.32 {d22-d23}, [rp] +L(tl1): tst n, #2 + beq L(tl2) + sub up, up, #8 + vld1.32 {d22}, [up] + sub rp, rp, #8 + vst1.32 {d22}, [rp] +L(tl2): tst n, #1 + beq L(tl3) + sub up, up, #4 + vld1.32 {d22[0]}, [up] + sub rp, rp, #4 + vst1.32 {d22[0]}, [rp] +L(tl3): bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm new file mode 100644 index 0000000..2e05afe --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm @@ -0,0 +1,90 @@ +dnl ARM Neon mpn_copyi optimised for A15. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library.
+dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.75 slower than core register code +C Cortex-A15 0.52 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_copyi) + cmp n, #7 C short operands go straight to the tail code + ble L(bc) + +C Copy until rp is 128-bit aligned + tst rp, #4 + beq L(al1) + vld1.32 {d22[0]}, [up]! + sub n, n, #1 + vst1.32 {d22[0]}, [rp]! +L(al1): tst rp, #8 + beq L(al2) + vld1.32 {d22}, [up]! + sub n, n, #2 + vst1.32 {d22}, [rp:64]! +L(al2): vld1.32 {d26-d27}, [up]! + subs n, n, #12 C bias n: 4 limbs are already loaded and a loop pass needs 8 more + blt L(end) + + ALIGN(16) +L(top): vld1.32 {d22-d23}, [up]! + vst1.32 {d26-d27}, [rp:128]! + vld1.32 {d26-d27}, [up]! + vst1.32 {d22-d23}, [rp:128]! + subs n, n, #8 + bge L(top) + +L(end): vst1.32 {d26-d27}, [rp:128]! + +C Copy last 0-7 limbs. Note that rp is aligned after loop, but not when we +C arrive here via L(bc) +L(bc): tst n, #4 + beq L(tl1) + vld1.32 {d22-d23}, [up]! + vst1.32 {d22-d23}, [rp]!
+L(tl1): tst n, #2 + beq L(tl2) + vld1.32 {d22}, [up]! + vst1.32 {d22}, [rp]! +L(tl2): tst n, #1 + beq L(tl3) + vld1.32 {d22[0]}, [up] C last limb, so no pointer writeback + vst1.32 {d22[0]}, [rp] +L(tl3): bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm new file mode 100644 index 0000000..2c11d6d --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm @@ -0,0 +1,177 @@ +dnl ARM Neon mpn_rsh1add_n, mpn_rsh1sub_n. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 4-5 +C Cortex-A15 2.5 + +C TODO +C * Try to make this smaller, its size (384 bytes) is excessive. +C * Try to reach 2.25 c/l on A15, to match the addlsh_1 family.
+C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps +C insufficiently for A7 and A8. + +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +ifdef(`OPERATION_rsh1add_n', ` + define(`ADDSUBS', `adds $1, $2, $3') + define(`ADCSBCS', `adcs $1, $2, $3') + define(`IFADD', `$1') + define(`IFSUB', `') + define(`func', mpn_rsh1add_n)') +ifdef(`OPERATION_rsh1sub_n', ` + define(`ADDSUBS', `subs $1, $2, $3') + define(`ADCSBCS', `sbcs $1, $2, $3') + define(`IFADD', `') + define(`IFSUB', `$1') + define(`func', mpn_rsh1sub_n)') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n) + +ASM_START() +PROLOGUE(func) + push {r4-r10} + + ands r4, n, #3 C dispatch on n mod 4 + beq L(b00) + cmp r4, #2 + blo L(b01) + beq L(b10) + +L(b11): ldmia up!, {r9,r10,r12} + ldmia vp!, {r5,r6,r7} + ADDSUBS( r9, r9, r5) + vmov d4, r9, r9 + ADCSBCS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vshr.u64 d3, d4, #1 + vmov d1, r10, r12 + vsli.u64 d3, d1, #31 + vshr.u64 d2, d1, #1 + vst1.32 d3[0], [rp]! + bics n, n, #3 + beq L(wd2) +L(gt3): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + b L(mi0) + +L(b10): ldmia up!, {r10,r12} + ldmia vp!, {r6,r7} + ADDSUBS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vmov d4, r10, r12 + bics n, n, #2 + vshr.u64 d2, d4, #1 + beq L(wd2) +L(gt2): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + b L(mi0) + +L(b01): ldr r12, [up], #4 + ldr r7, [vp], #4 + ADDSUBS( r12, r12, r7) + vmov d4, r12, r12 + bics n, n, #1 + bne L(gt1) + mov r5, r12, lsr #1 +IFADD(` adc r1, n, #0') +IFSUB(` adc r1, n, #1') + bfi r5, r1, #31, #1 C the carry or borrow becomes the top bit of the result + str r5, [rp] + and r0, r12, #1 C return the bit shifted out at the low end + pop {r4-r10} + bx r14 +L(gt1): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + vshr.u64 d2, d4, #1 + ADCSBCS( r8, r8, r4) + ADCSBCS( r9, r9, r5) + vmov d0, r8, r9 + ADCSBCS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vsli.u64 d2, d0, #31 + vshr.u64 d3, d0, #1 + vst1.32 d2[0], [rp]!
+ b L(mi1) + +L(b00): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + ADDSUBS( r8, r8, r4) + ADCSBCS( r9, r9, r5) + vmov d4, r8, r9 + ADCSBCS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vshr.u64 d3, d4, #1 + b L(mi1) + + ALIGN(16) +L(top): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + vsli.u64 d3, d1, #63 C the top bit of each 64-bit piece comes from the next piece + vshr.u64 d2, d1, #1 + vst1.32 d3, [rp]! +L(mi0): ADCSBCS( r8, r8, r4) + ADCSBCS( r9, r9, r5) + vmov d0, r8, r9 + ADCSBCS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vsli.u64 d2, d0, #63 + vshr.u64 d3, d0, #1 + vst1.32 d2, [rp]! +L(mi1): vmov d1, r10, r12 + sub n, n, #4 + tst n, n + bne L(top) + +L(end): vsli.u64 d3, d1, #63 + vshr.u64 d2, d1, #1 + vst1.32 d3, [rp]! +L(wd2): vmov r4, r5, d2 +IFADD(` adc r1, n, #0') +IFSUB(` adc r1, n, #1') + bfi r5, r1, #31, #1 C the carry or borrow becomes the top bit of the result + stm rp, {r4,r5} + +L(rtn): vmov.32 r0, d4[0] + and r0, r0, #1 C return the bit shifted out of the lowest limb + pop {r4-r10} + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm new file mode 100644 index 0000000..ed7bfe8 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/submul_1.asm @@ -0,0 +1,159 @@ +dnl ARM mpn_submul_1 optimised for A15. + +dnl Copyright 2012, 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb best +C StrongARM: - +C XScale ? +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 5.75 3.75 +C Cortex-A15 2.32 this + +C This code uses umlal and umaal for adding in the rp[] data, keeping the +C recurrency path separate from any multiply instructions. It performs well on +C A15, but not quite at the multiply bandwidth like the corresponding addmul_1 +C code. +C +C We don't use r12 due to ldrd and strd limitations. +C +C This loop complements U on the fly, +C U' = B^n - 1 - U +C and then uses that +C R - U*v = R + U'*v + v - B^n v + +C Architecture requirements: +C v5 - +C v5t - +C v5te ldrd strd +C v6 umaal +C v6t2 - +C v7a - + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') +define(`v0', `r3') + +define(`w0', `r10') define(`w1', `r11') +define(`u0', `r8') define(`u1', `r9') + +ASM_START() +PROLOGUE(mpn_submul_1) + sub sp, sp, #32 + strd r10, r11, [sp, #24] + strd r8, r9, [sp, #16] + strd r6, r7, [sp, #8] + strd r4, r5, [sp, #0] +C the five instructions above lay the stack out like: push { r4-r11 } + + ands r6, n, #3 + sub n, n, #3 C no flag update here, the branches below still test n mod 4 + beq L(b00) + cmp r6, #2 + bcc L(b01) + beq L(b10) + +L(b11): mov r6, #0 + ldr u1, [up], #-4 + ldr w1, [rp], #-16 + mvn u1, u1 + adds r7, v0, #0 + b L(mid) + +L(b00): ldrd u0, u1, [up] + ldrd w0, w1, [rp], #-12 + mvn u0, u0 + mvn u1, u1 + mov r6, v0 + umaal w0, r6, u0, v0 + cmn r13, #0 C carry clear + mov r7, #0 + str w0, [rp, #12] + b L(mid) + +L(b10): ldrd u0, u1, [up], #8 + ldrd w0, w1, [rp] + mvn u0, u0 + mvn u1, u1 + mov r4, v0 + umaal w0, r4, u0, v0 + mov r5, #0 + str w0, [rp], #-4 + umlal w1, r5, u1, v0 + adds n, n, #0 + bmi L(end) + b L(top) + +L(b01): ldr u1, [up], #4 + ldr w1, [rp], #-8 + mvn u1, u1 + mov r5, v0 + mov r4, #0 + umaal w1, r5, u1, v0 + tst n, n + bmi L(end) +
+C ALIGN(16) +L(top): ldrd u0, u1, [up, #0] + adcs r4, r4, w1 + mvn u0, u0 + ldrd w0, w1, [rp, #12] + mvn u1, u1 + mov r6, #0 + umlal w0, r6, u0, v0 C 1 2 + adcs r5, r5, w0 + mov r7, #0 + strd r4, r5, [rp, #8] +L(mid): umaal w1, r7, u1, v0 C 2 3 + ldrd u0, u1, [up, #8] + add up, up, #16 + adcs r6, r6, w1 + mvn u0, u0 + ldrd w0, w1, [rp, #20] + mvn u1, u1 + mov r4, #0 + umlal w0, r4, u0, v0 C 3 4 + adcs r7, r7, w0 + mov r5, #0 + strd r6, r7, [rp, #16]! + sub n, n, #4 + umlal w1, r5, u1, v0 C 0 1 + tst n, n + bpl L(top) + +L(end): adcs r4, r4, w1 + str r4, [rp, #8] + adc r0, r5, #0 + sub r0, v0, r0 C result is v minus the accumulated high limb, per the identity in the header + pop { r4-r11 } + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora17/addmul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora17/addmul_1.asm new file mode 100644 index 0000000..c11ed47 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora17/addmul_1.asm @@ -0,0 +1,34 @@ +dnl ARM mpn_addmul_1 + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/.
+ +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_addmul_1) +include_mpn(`arm/v6/addmul_1.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora17/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v7a/cora17/gmp-mparam.h new file mode 100644 index 0000000..143d4bc --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora17/gmp-mparam.h @@ -0,0 +1,233 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1800 MHz Cortex-A17 with Neon (in spite of file position) */ +/* FFT tuning limit = 51243975 */ +/* Generated by tuneup.c, 2019-10-29, gcc 6.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 3 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 54.08% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 45 + +#define DIV_1_VS_MUL_1_PERCENT 248 + +#define MUL_TOOM22_THRESHOLD 38 +#define MUL_TOOM33_THRESHOLD 132 +#define MUL_TOOM44_THRESHOLD 200 +#define MUL_TOOM6H_THRESHOLD 303 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 137 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 179 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 132 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 191 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 62 +#define SQR_TOOM3_THRESHOLD 189 +#define SQR_TOOM4_THRESHOLD 354 +#define SQR_TOOM6_THRESHOLD 426 +#define SQR_TOOM8_THRESHOLD 608 + +#define MULMID_TOOM42_THRESHOLD 62 + +#define MULMOD_BNM1_THRESHOLD 21 +#define SQRMOD_BNM1_THRESHOLD 29 + +#define MUL_FFT_MODF_THRESHOLD 595 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 595, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \ + { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ + { 27, 7}, { 55, 9}, { 15, 8}, { 31, 7}, \ + { 
63, 8}, { 43, 9}, { 23, 8}, { 55, 9}, \ + { 31, 8}, { 63, 9}, { 39, 8}, { 83, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 103,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ + { 95, 9}, { 191,10}, { 111,11}, { 63,10}, \ + { 143, 8}, { 575,10}, { 159,11}, { 95,10}, \ + { 191, 9}, { 383, 8}, { 767, 9}, { 399, 8}, \ + { 799,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511, 8}, { 1023, 9}, { 543, 8}, { 1087, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351, 9}, \ + { 703,10}, { 367, 9}, { 735,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 399, 9}, { 799,10}, \ + { 415, 9}, { 831,10}, { 431, 9}, { 863,11}, \ + { 223,10}, { 447,12}, { 127,10}, { 511, 9}, \ + { 1023,10}, { 543, 9}, { 1087,10}, { 607, 9}, \ + { 1215,11}, { 319,10}, { 671, 9}, { 1343,11}, \ + { 351,10}, { 735,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 863,11}, { 447,10}, \ + { 895,13}, { 127,11}, { 511,10}, { 1023,11}, \ + { 543,10}, { 1087,11}, { 607,10}, { 1215,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,10}, \ + { 1471,12}, { 383,11}, { 799,10}, { 1599,11}, \ + { 863,10}, { 1727,12}, { 447,11}, { 991,10}, \ + { 1983,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,11}, \ + { 1983,13}, { 511,12}, { 1087,11}, { 2239,12}, \ + { 1215,11}, { 2431,13}, { 639,12}, { 1471,11}, \ + { 2943,13}, { 767,12}, { 1727,13}, { 895,12}, \ + { 1983,14}, { 511,13}, { 1023,12}, { 2239,13}, \ + { 1151,12}, { 2495,13}, { 1279,12}, { 2623,13}, \ + { 1407,12}, { 2943,14}, { 767,13}, { 1535,12}, \ + { 3135,13}, { 1663,12}, { 3455,13}, { 1919,12}, \ + { 3839,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2943,12}, \ + { 5887,14}, { 1535,13}, { 3455,14}, { 1791,13}, \ + { 3967,15}, { 1023,14}, { 2047,13}, { 4479,14}, \ + { 2303,13}, { 4991,12}, { 9983,14}, { 
2559,13}, \ + { 5247,14}, { 2815,13}, { 5887,15}, { 1535,14}, \ + { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 194 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 500 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 500, 5}, { 29, 6}, { 15, 5}, { 31, 6}, \ + { 16, 5}, { 33, 6}, { 29, 7}, { 15, 6}, \ + { 32, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 41, 8}, { 23, 7}, { 49, 8}, \ + { 27, 9}, { 15, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 95,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 79, 9}, { 159,10}, \ + { 95, 9}, { 191,10}, { 111,11}, { 63,10}, \ + { 127, 9}, { 255,10}, { 143, 9}, { 287,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383, 8}, { 767, 9}, { 399,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,10}, { 287, 9}, { 575,10}, { 303,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,10}, { 351, 9}, { 703,10}, { 367, 9}, \ + { 735,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,10}, { 415, 9}, { 831,10}, \ + { 431, 9}, { 863,10}, { 447,12}, { 127,11}, \ + { 255,10}, { 511, 9}, { 1023,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 607, 9}, { 1215,11}, \ + { 319,10}, { 671,11}, { 351,10}, { 735,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 863,11}, { 447,10}, { 895,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,11}, { 607,10}, { 1215,12}, { 319,11}, \ + { 671,10}, { 1343,11}, { 735,10}, { 1471,12}, \ + { 383,11}, { 799,10}, { 1599,11}, { 863,12}, \ + { 447,11}, { 959,10}, { 1919,11}, { 991,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,10}, { 2431,12}, { 639,11}, { 1343,12}, \ + { 703,11}, { 1471,13}, { 383,12}, { 767,11}, \ + { 1599,12}, { 831,11}, { 1727,12}, { 959,11}, \ + { 1919,14}, { 255,13}, { 511,12}, { 1087,11}, \ + { 2239,12}, 
{ 1215,11}, { 2431,13}, { 639,12}, \ + { 1471,11}, { 2943,13}, { 767,12}, { 1727,13}, \ + { 895,12}, { 1983,14}, { 511,13}, { 1023,12}, \ + { 2239,13}, { 1151,12}, { 2495,13}, { 1279,12}, \ + { 2623,13}, { 1407,12}, { 2943,14}, { 767,13}, \ + { 1535,12}, { 3071,13}, { 1663,12}, { 3455,13}, \ + { 1919,12}, { 3839,15}, { 511,14}, { 1023,13}, \ + { 2175,12}, { 4479,13}, { 2431,14}, { 1279,13}, \ + { 2943,12}, { 5887,14}, { 1535,13}, { 3455,14}, \ + { 1791,13}, { 3967,15}, { 1023,14}, { 2047,13}, \ + { 4479,14}, { 2303,13}, { 4991,12}, { 9983,14}, \ + { 2559,13}, { 5119,14}, { 2815,13}, { 5887,15}, \ + { 1535,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 199 +#define SQR_FFT_THRESHOLD 4736 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 27 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 0 /* always */ +#define SQRLO_DC_THRESHOLD 26 +#define SQRLO_SQR_THRESHOLD 8907 + +#define DC_DIV_QR_THRESHOLD 38 +#define DC_DIVAPPR_Q_THRESHOLD 103 +#define DC_BDIV_QR_THRESHOLD 44 +#define DC_BDIV_Q_THRESHOLD 98 + +#define INV_MULMOD_BNM1_THRESHOLD 78 +#define INV_NEWTON_THRESHOLD 165 +#define INV_APPR_THRESHOLD 115 + +#define BINV_NEWTON_THRESHOLD 296 +#define REDC_1_TO_REDC_2_THRESHOLD 2 +#define REDC_2_TO_REDC_N_THRESHOLD 147 + +#define MU_DIV_QR_THRESHOLD 2089 +#define MU_DIVAPPR_Q_THRESHOLD 2089 +#define MUPI_DIV_QR_THRESHOLD 70 +#define MU_BDIV_QR_THRESHOLD 1718 +#define MU_BDIV_Q_THRESHOLD 2089 + +#define POWM_SEC_TABLE 7,19,107,480,1486 + +#define GET_STR_DC_THRESHOLD 14 +#define GET_STR_PRECOMPUTE_THRESHOLD 29 +#define SET_STR_DC_THRESHOLD 126 +#define SET_STR_PRECOMPUTE_THRESHOLD 541 + +#define FAC_DSC_THRESHOLD 132 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 30 +#define HGCD2_DIV1_METHOD 1 /* 6.55% faster than 3 */ +#define HGCD_THRESHOLD 54 +#define HGCD_APPR_THRESHOLD 52 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 303 +#define GCDEXT_DC_THRESHOLD 225 +#define 
JACOBI_BASE_METHOD 4 /* 9.73% faster than 1 */ + +/* Tuneup completed successfully, took 111418 seconds */ diff --git a/gmp-6.3.0/mpn/arm/v7a/cora17/mod_34lsub1.asm b/gmp-6.3.0/mpn/arm/v7a/cora17/mod_34lsub1.asm new file mode 100644 index 0000000..39e5a15 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora17/mod_34lsub1.asm @@ -0,0 +1,121 @@ +dnl ARM mpn_mod_34lsub1 -- remainder modulo 2^24-1. + +dnl Copyright 2012, 2013, 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A5 2.67 +C Cortex-A7 2.37 +C Cortex-A8 2.34 +C Cortex-A9 ? +C Cortex-A15 1.39 +C Cortex-A17 1.60 +C Cortex-A53 2.51 + +define(`ap', r0) +define(`n', r1) + +C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n) + +C TODO +C * Write cleverer summation code. +C * Consider loading 6 64-bit aligned registers at a time, to approach 1 c/l. 
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mod_34lsub1)		C r0 = value congruent to the n limbs at ap mod 2^24-1, not fully reduced
+	push	{ r4, r5, r6, r7 }
+
+	subs	n, n, #3
+	mov	r7, #0			C r7 collects carries out of r12:r3:r2
+	blt	L(le2)			C n <= 2
+
+	ldmia	ap!, { r2, r3, r12 }	C 96-bit accumulator = first three limbs
+	subs	n, n, #3
+	blt	L(sum)			C n <= 5
+	mov	r7, #0			C (r7 is still zero from above)
+	b	L(mid)
+
+L(top):	adds	r2, r2, r4		C accumulator += the three limbs in r4-r6
+	adcs	r3, r3, r5
+	adcs	r12, r12, r6
+	adc	r7, r7, #0		C count the carry out of bit 95
+L(mid):	ldmia	ap!, { r4, r5, r6 }
+	subs	n, n, #3
+	bpl	L(top)			C loop while at least three limbs were left
+
+	adds	r2, r2, r4		C add the triple loaded by the last pass
+	adcs	r3, r3, r5
+	adcs	r12, r12, r6
+	adc	r7, r7, #0		C r7 = carries so far, at most one per 96-bit add
+
+L(sum):	cmn	n, #2			C n is -3, -2 or -1: 0, 1 or 2 limbs left
+	movlo	r4, #0			C none left
+	ldrhs	r4, [ap], #4		C at least one left
+	movls	r5, #0			C at most one left
+	ldrhi	r5, [ap], #4		C two left
+
+	adds	r2, r2, r4
+	adcs	r3, r3, r5
+	adcs	r12, r12, #0
+	adc	r7, r7, #0		C final carry count
+
+L(sum2):				C fold 24-bit pieces, using 2^24 = 1 mod 2^24-1
+	bic	r0, r2, #0xff000000	C bits 0-23
+	add	r0, r0, r2, lsr #24	C bits 24-31, weight 2^24 = 1
+	add	r0, r0, r7		C carries, weight 2^96 = 1
+
+	mov	r7, r3, lsl #8
+	bic	r2, r7, #0xff000000	C bits 32-47, weight 2^32 = 2^8
+	add	r0, r0, r2
+	add	r0, r0, r3, lsr #16	C bits 48-63, weight 2^48 = 1
+
+	mov	r2, r12, lsl #16
+	bic	r1, r2, #0xff000000	C bits 64-71, weight 2^64 = 2^16
+	add	r0, r0, r1
+	add	r0, r0, r12, lsr #8	C bits 72-95, weight 2^72 = 1
+
+	pop	{ r4, r5, r6, r7 }
+	return	lr
+
+L(le2):	cmn	n, #1			C Z set iff the original n was 2
+	bne	L(1)
+	ldmia	ap!, { r2, r3 }
+	mov	r12, #0			C no third limb; r7 is already zero
+	b	L(sum2)
+L(1):	ldr	r2, [ap]		C single limb; NOTE(review): n = 0 also lands here and reads ap[0]
+	bic	r0, r2, #0xff000000
+	add	r0, r0, r2, lsr #24
+	pop	{ r4, r5, r6, r7 }
+	return	lr
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora17/mul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora17/mul_1.asm
new file mode 100644
index 0000000..d9b6042
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora17/mul_1.asm
@@ -0,0 +1,34 @@
+dnl  ARM mpn_mul_1
+
+dnl  Copyright 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_mul_1) +include_mpn(`arm/v6/mul_1.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora17/submul_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora17/submul_1.asm new file mode 100644 index 0000000..f3e8139 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora17/submul_1.asm @@ -0,0 +1,34 @@ +dnl ARM mpn_submul_1 + +dnl Copyright 2018 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. 
+ +include(`../config.m4') + +MULFUNC_PROLOGUE(mpn_submul_1) +include_mpn(`arm/v6/submul_1.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora5/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v7a/cora5/gmp-mparam.h new file mode 100644 index 0000000..e3564e0 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora5/gmp-mparam.h @@ -0,0 +1,205 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1500 MHz Cortex-A5 (odroid c1) */ +/* FFT tuning limit = 18,235,562 */ +/* Generated by tuneup.c, 2019-10-22, gcc 4.9 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 23 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 132.79% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 52 + +#define DIV_1_VS_MUL_1_PERCENT 213 + +#define MUL_TOOM22_THRESHOLD 48 +#define MUL_TOOM33_THRESHOLD 143 +#define MUL_TOOM44_THRESHOLD 262 +#define MUL_TOOM6H_THRESHOLD 414 +#define MUL_TOOM8H_THRESHOLD 527 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 153 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 168 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 152 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 180 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 226 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 66 +#define SQR_TOOM3_THRESHOLD 149 +#define SQR_TOOM4_THRESHOLD 348 +#define SQR_TOOM6_THRESHOLD 517 +#define SQR_TOOM8_THRESHOLD 608 + +#define MULMID_TOOM42_THRESHOLD 70 + +#define MULMOD_BNM1_THRESHOLD 26 +#define SQRMOD_BNM1_THRESHOLD 28 + +#define MUL_FFT_MODF_THRESHOLD 660 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 660, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \ + { 17, 5}, { 35, 6}, { 29, 7}, { 15, 6}, \ + { 37, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \ + { 43, 7}, { 37, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 51, 8}, { 27, 7}, { 55, 8}, \ + { 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 71, 
9}, { 39, 8}, \ + { 83, 9}, { 47, 8}, { 99, 9}, { 55,10}, \ + { 31, 9}, { 63, 8}, { 127, 9}, { 79,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 167,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 159,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,10}, { 415,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 511, 9}, { 1023,10}, \ + { 543,11}, { 287,10}, { 607,11}, { 319,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,11}, { 575,10}, { 1151,11}, { 607,12}, \ + { 319,11}, { 703,12}, { 383,11}, { 831,12}, \ + { 447,11}, { 895,13}, { 255,12}, { 511,11}, \ + { 1087,12}, { 575,11}, { 1183,12}, { 639,11}, \ + { 1279,12}, { 703,13}, { 383,12}, { 767,11}, \ + { 1535,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1151,13}, { 639,12}, { 1407,13}, { 767,12}, \ + { 1599,13}, { 895,12}, { 1791,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2367,13}, \ + { 1279,12}, { 2559,13}, { 1407,14}, { 767,13}, \ + { 1535,12}, { 3071,13}, { 1663,12}, { 3327,13}, \ + { 1791,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4351,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 140 +#define MUL_FFT_THRESHOLD 7552 + +#define SQR_FFT_MODF_THRESHOLD 590 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 590, 5}, { 33, 6}, { 17, 5}, { 35, 6}, \ + { 36, 7}, { 19, 6}, { 40, 7}, { 21, 6}, \ + { 43, 7}, { 23, 6}, { 47, 7}, { 37, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \ + { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 167,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 
159,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511,10}, { 271, 9}, \ + { 543,11}, { 159,10}, { 319, 9}, { 639,10}, \ + { 335, 9}, { 671,10}, { 351,11}, { 191,10}, \ + { 383, 9}, { 767,10}, { 415,12}, { 127,11}, \ + { 255,10}, { 511, 9}, { 1023,10}, { 543, 9}, \ + { 1087,11}, { 287,10}, { 575, 9}, { 1151,10}, \ + { 607,11}, { 319,10}, { 671,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 831,13}, { 127,12}, { 255,11}, { 511,10}, \ + { 1023,11}, { 543,10}, { 1087,11}, { 575,10}, \ + { 1151,11}, { 607,12}, { 319,11}, { 735,12}, \ + { 383,11}, { 831,12}, { 447,11}, { 927,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 639,11}, { 1279,12}, { 703,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \ + { 1663,12}, { 895,11}, { 1791,12}, { 959,14}, \ + { 255,13}, { 511,12}, { 1023,11}, { 2047,12}, \ + { 1151,13}, { 639,12}, { 1407,13}, { 767,12}, \ + { 1599,13}, { 895,12}, { 1791,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2367,13}, \ + { 1279,12}, { 2559,13}, { 1407,14}, { 767,13}, \ + { 1535,12}, { 3071,13}, { 1663,12}, { 3327,13}, \ + { 1791,15}, { 511,14}, { 1023,13}, { 2175,12}, \ + { 4351,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 144 +#define SQR_FFT_THRESHOLD 5760 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 39 +#define MULLO_MUL_N_THRESHOLD 14709 +#define SQRLO_BASECASE_THRESHOLD 8 +#define SQRLO_DC_THRESHOLD 33 +#define SQRLO_SQR_THRESHOLD 11278 + +#define DC_DIV_QR_THRESHOLD 36 +#define DC_DIVAPPR_Q_THRESHOLD 116 +#define DC_BDIV_QR_THRESHOLD 48 +#define DC_BDIV_Q_THRESHOLD 140 + +#define INV_MULMOD_BNM1_THRESHOLD 95 +#define INV_NEWTON_THRESHOLD 181 +#define INV_APPR_THRESHOLD 125 + +#define BINV_NEWTON_THRESHOLD 327 +#define REDC_1_TO_REDC_2_THRESHOLD 0 /* always */ +#define REDC_2_TO_REDC_N_THRESHOLD 152 + +#define MU_DIV_QR_THRESHOLD 2350 +#define MU_DIVAPPR_Q_THRESHOLD 2130 +#define 
MUPI_DIV_QR_THRESHOLD 98 +#define MU_BDIV_QR_THRESHOLD 1970 +#define MU_BDIV_Q_THRESHOLD 2172 + +#define POWM_SEC_TABLE 6,37,108,624,2351 + +#define GET_STR_DC_THRESHOLD 28 +#define GET_STR_PRECOMPUTE_THRESHOLD 44 +#define SET_STR_DC_THRESHOLD 309 +#define SET_STR_PRECOMPUTE_THRESHOLD 762 + +#define FAC_DSC_THRESHOLD 236 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 25 +#define HGCD2_DIV1_METHOD 5 /* 2.92% faster than 3 */ +#define HGCD_THRESHOLD 70 +#define HGCD_APPR_THRESHOLD 59 +#define HGCD_REDUCE_THRESHOLD 4120 +#define GCD_DC_THRESHOLD 229 +#define GCDEXT_DC_THRESHOLD 233 +#define JACOBI_BASE_METHOD 1 /* 17.07% faster than 4 */ + +/* Tuneup completed successfully, took 47845 seconds */ diff --git a/gmp-6.3.0/mpn/arm/v7a/cora7/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v7a/cora7/gmp-mparam.h new file mode 100644 index 0000000..78de045 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora7/gmp-mparam.h @@ -0,0 +1,202 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. 
If not, +see https://www.gnu.org/licenses/. */ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 900 MHz Cortex-A7 (raspberry pi2) */ +/* FFT tuning limit = 21,559,921 */ +/* Generated by tuneup.c, 2019-10-22, gcc 8.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 7 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 8 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 18 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 64.16% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 48 + +#define DIV_1_VS_MUL_1_PERCENT 216 + +#define MUL_TOOM22_THRESHOLD 39 +#define MUL_TOOM33_THRESHOLD 129 +#define MUL_TOOM44_THRESHOLD 196 +#define MUL_TOOM6H_THRESHOLD 327 +#define MUL_TOOM8H_THRESHOLD 478 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 129 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 183 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 132 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 144 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 190 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 52 +#define SQR_TOOM3_THRESHOLD 162 +#define SQR_TOOM4_THRESHOLD 268 +#define SQR_TOOM6_THRESHOLD 399 +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 50 + +#define MULMOD_BNM1_THRESHOLD 21 +#define SQRMOD_BNM1_THRESHOLD 25 + +#define MUL_FFT_MODF_THRESHOLD 636 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 636, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \ + { 29, 7}, { 15, 6}, { 33, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 43, 8}, \ + { 23, 7}, { 49, 8}, { 27, 7}, { 55, 8}, \ + 
{ 31, 7}, { 63, 8}, { 43, 9}, { 23, 8}, \ + { 55, 9}, { 31, 8}, { 67, 9}, { 39, 8}, \ + { 83, 9}, { 47, 8}, { 95, 9}, { 55,10}, \ + { 31, 9}, { 79,10}, { 47, 9}, { 103,11}, \ + { 31,10}, { 63, 9}, { 135,10}, { 79, 9}, \ + { 159,10}, { 95, 9}, { 191,10}, { 111,11}, \ + { 63,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351,11}, \ + { 191,10}, { 383, 9}, { 767,10}, { 399, 9}, \ + { 799,10}, { 415,11}, { 223,12}, { 127,11}, \ + { 255,10}, { 511, 9}, { 1023,10}, { 543,11}, \ + { 287,10}, { 607,11}, { 319,10}, { 671,11}, \ + { 351,12}, { 191,11}, { 383,10}, { 799,11}, \ + { 415,10}, { 831,13}, { 127,12}, { 255,11}, \ + { 511,10}, { 1023,11}, { 543,10}, { 1087,11}, \ + { 607,12}, { 319,11}, { 735,12}, { 383,11}, \ + { 863,12}, { 447,11}, { 959,13}, { 255,12}, \ + { 511,11}, { 1087,12}, { 575,11}, { 1215,12}, \ + { 639,11}, { 1279,12}, { 703,13}, { 383,12}, \ + { 767,11}, { 1599,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1215,13}, { 639,12}, { 1471,13}, \ + { 767,12}, { 1663,13}, { 895,12}, { 1855,14}, \ + { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \ + { 2431,13}, { 1407,14}, { 767,13}, { 1663,12}, \ + { 3327,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2431,14}, { 1279,13}, { 8192,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 133 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 535 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 535, 5}, { 25, 6}, { 13, 5}, { 28, 6}, \ + { 15, 5}, { 31, 6}, { 29, 7}, { 15, 6}, \ + { 33, 7}, { 17, 6}, { 36, 7}, { 19, 6}, \ + { 39, 7}, { 29, 8}, { 15, 7}, { 37, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 49, 8}, \ + { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, { 31, 8}, \ + { 67, 9}, { 39, 8}, { 79, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 
191,10}, { 111,11}, { 63,10}, { 143, 9}, \ + { 287,10}, { 159,11}, { 95,10}, { 191,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287,11}, { 159,10}, \ + { 319, 9}, { 639,10}, { 335, 9}, { 671,10}, \ + { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,10}, { 415, 9}, { 831,11}, \ + { 223,12}, { 127,10}, { 543,11}, { 287,10}, \ + { 607,11}, { 319,10}, { 671,11}, { 351,10}, \ + { 703,12}, { 191,11}, { 383,10}, { 799,11}, \ + { 415,10}, { 831,13}, { 127,11}, { 511,10}, \ + { 1023,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 735,12}, { 383,11}, { 863,12}, \ + { 447,11}, { 991,12}, { 511,11}, { 1087,12}, \ + { 575,11}, { 1215,12}, { 639,11}, { 1279,12}, \ + { 703,13}, { 383,12}, { 767,11}, { 1535,12}, \ + { 831,11}, { 1663,12}, { 959,13}, { 511,12}, \ + { 1215,13}, { 639,12}, { 1471,13}, { 767,12}, \ + { 1663,13}, { 895,12}, { 1855,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 2431,13}, \ + { 1407,14}, { 767,13}, { 1791,15}, { 511,14}, \ + { 1023,13}, { 2431,14}, { 1279,13}, { 8192,14}, \ + { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 134 +#define SQR_FFT_THRESHOLD 4736 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 27 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 5 +#define SQRLO_DC_THRESHOLD 31 +#define SQRLO_SQR_THRESHOLD 9449 + +#define DC_DIV_QR_THRESHOLD 28 +#define DC_DIVAPPR_Q_THRESHOLD 90 +#define DC_BDIV_QR_THRESHOLD 32 +#define DC_BDIV_Q_THRESHOLD 110 + +#define INV_MULMOD_BNM1_THRESHOLD 78 +#define INV_NEWTON_THRESHOLD 134 +#define INV_APPR_THRESHOLD 98 + +#define BINV_NEWTON_THRESHOLD 278 +#define REDC_1_TO_REDC_2_THRESHOLD 4 +#define REDC_2_TO_REDC_N_THRESHOLD 123 + +#define MU_DIV_QR_THRESHOLD 1718 +#define MU_DIVAPPR_Q_THRESHOLD 1685 +#define MUPI_DIV_QR_THRESHOLD 62 +#define MU_BDIV_QR_THRESHOLD 1528 +#define MU_BDIV_Q_THRESHOLD 1718 + +#define POWM_SEC_TABLE 1,22,95,563,1955 + +#define GET_STR_DC_THRESHOLD 28 
+#define GET_STR_PRECOMPUTE_THRESHOLD 51 +#define SET_STR_DC_THRESHOLD 182 +#define SET_STR_PRECOMPUTE_THRESHOLD 638 + +#define FAC_DSC_THRESHOLD 153 +#define FAC_ODD_THRESHOLD 56 + +#define MATRIX22_STRASSEN_THRESHOLD 25 +#define HGCD2_DIV1_METHOD 1 /* 5.04% faster than 3 */ +#define HGCD_THRESHOLD 55 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 3389 +#define GCD_DC_THRESHOLD 153 +#define GCDEXT_DC_THRESHOLD 180 +#define JACOBI_BASE_METHOD 1 /* 30.60% faster than 4 */ + +/* Tuneup completed successfully, took 75202 seconds */ diff --git a/gmp-6.3.0/mpn/arm/v7a/cora8/bdiv_q_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora8/bdiv_q_1.asm new file mode 100644 index 0000000..e74b260 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora8/bdiv_q_1.asm @@ -0,0 +1,158 @@ +dnl ARM v6 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. +dnl This is v6 code but it runs well on just the v7a Cortex-A8, A9, and A15. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. 
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C            norm   unorm
+C 1176        -      -
+C Cortex-A5   9     13
+C Cortex-A7  12     18
+C Cortex-A8  13     14
+C Cortex-A9   9     10    not measured since latest edits
+C Cortex-A15  7      7
+C Cortex-A53 16     24
+
+C Architecture requirements:
+C v5    -
+C v5t   clz
+C v5te  -
+C v6    umaal
+C v6t2  -
+C v7a   -
+
+define(`rp', `r0') C quotient limbs are stored through this pointer
+define(`up', `r1') C source limbs are loaded through this pointer
+define(`n', `r2') C limb count, decremented once per limb
+define(`d', `r3') C divisor limb
+define(`di_arg', `sp[0]') C just mpn_pi1_bdiv_q_1
+define(`cnt_arg', `sp[4]') C just mpn_pi1_bdiv_q_1
+
+define(`cy', `r7') C carry limb handed from one quotient limb to the next
+define(`cnt', `r6') C right-shift count applied to d and to the source limbs
+define(`tnc', `r4') C 32 - cnt, only set up on the unorm path
+C mpn_bdiv_q_1 strips the trailing zero bits of d, computes the inverse of what is left, then joins mpn_pi1_bdiv_q_1 at its pi1 label.
+ASM_START()
+PROLOGUE(mpn_bdiv_q_1)
+        push    {r6-r11}
+C Split off the trailing zero bits of d: cnt = their number, d = d >> cnt.
+        rsb     r10, d, #0                  C r10 = -d
+        and     r10, r10, d                 C r10 = d & -d, the lowest set bit of d
+        clz     r10, r10                    C leading zeros above that bit
+        rsbs    cnt, r10, #31               C count_trailing_zeros; also sets Z iff cnt == 0
+        mov     d, d, lsr cnt               C drop the trailing zero bits of d
+
+C binvert limb: table lookup for a starting value, then two steps of inv = 2*inv - d*inv^2
+        LEA(    r10, binvert_limb_table)
+        and     r12, d, #254                C r12 = d & 0xfe
+        ldrb    r10, [r10, r12, lsr #1]     C r10 = table byte at index (d & 0xfe) >> 1 (presumably an 8-bit inverse; table is defined elsewhere)
+        mul     r12, r10, r10               C r12 = inv^2
+        mul     r12, d, r12                 C r12 = d * inv^2
+        rsb     r12, r12, r10, lsl #1       C r12 = 2*inv - d*inv^2
+        mul     r10, r12, r12               C second step, same formula on the refined value
+        mul     r10, d, r10
+        rsb     r10, r10, r12, lsl #1       C r10 = inverse
+        b       L(pi1)                      C join the shared code; its bne tests the flags from rsbs above (assumes LEA leaves flags alone; TODO confirm)
+EPILOGUE()
+C mpn_pi1_bdiv_q_1: same division, but the inverse (di_arg) and shift count (cnt_arg) are read from the stack instead of being computed.
+PROLOGUE(mpn_pi1_bdiv_q_1)
+        push    {r6-r11}                    C 6 registers = 24 bytes, so the stack arguments are now at sp+24 and sp+28
+
+        ldr     cnt, [sp, #28]              C cnt_arg (sp[4] at entry)
+        ldr     r10, [sp, #24]              C di_arg (sp[0] at entry)
+        cmp     cnt, #0                     C Z set iff no shifting is needed
+
+L(pi1): ldr     r11, [up], #4               C up[0]
+        mov     cy, #0                      C no carry into the first limb
+        rsb     r8, r10, #0                 C r8 = -inverse
+        bne     L(unorm)                    C cnt != 0 (flags come from cmp above, or from rsbs when entered via mpn_bdiv_q_1)
+
+L(norm):                                    C cnt == 0: source limbs are used exactly as loaded
+        subs    n, n, #1
+        mul     r11, r11, r10               C first quotient limb = up[0] * inverse (low 32 bits)
+        beq     L(edn)                      C n was 1: only that limb to store
+
+        ALIGN(16)
+L(tpn): ldr     r9, [up], #4                C next source limb
+        mov     r12, #0
+        str     r11, [rp], #4               C store the quotient limb computed last time round
+        umaal   r12, cy, r11, d             C cy:r12 = r11*d + r12 + cy; only the high word (cy) is used afterwards
+        mul     r11, r9, r10                C r11 = limb * inverse
+        mla     r11, cy, r8, r11            C r11 += cy * -inverse, i.e. (limb - cy) * inverse
+        subs    n, n, #1
+        bne     L(tpn)
+
+L(edn): str     r11, [rp]                   C last quotient limb
+        pop     {r6-r11}
+        bx      r14                         C return
+
+L(unorm):                                   C cnt != 0: source limbs are shifted right by cnt on the fly
+        push    {r4-r5}                     C r4 (tnc) and r5 are only used on this path
+        rsb     tnc, cnt, #32               C tnc = 32 - cnt
+        mov     r5, r11, lsr cnt            C r5 = up[0] >> cnt
+        subs    n, n, #1
+        beq     L(ed1)                      C n was 1
+
+        ldr     r12, [up], #4               C up[1]
+        orr     r9, r5, r12, lsl tnc        C r9 = (up[0] >> cnt) | (up[1] << tnc)
+        mov     r5, r12, lsr cnt            C keep up[1] >> cnt for the next limb
+        mul     r11, r9, r10                C first quotient limb
+        subs    n, n, #1
+        beq     L(edu)                      C n was 2
+
+        ALIGN(16)
+L(tpu): ldr     r12, [up], #4               C next source limb
+        orr     r9, r5, r12, lsl tnc        C combine its low bits with the bits left over in r5
+        mov     r5, r12, lsr cnt            C leftover bits for the following limb
+        mov     r12, #0
+        str     r11, [rp], #4               C store the quotient limb computed last time round
+        umaal   r12, cy, r11, d             C cy = high word of r11*d + cy
+        mul     r11, r9, r10
+        mla     r11, cy, r8, r11            C (shifted limb - cy) * inverse
+        subs    n, n, #1
+        bne     L(tpu)
+
+L(edu): str     r11, [rp], #4
+        mov     r12, #0
+        umaal   r12, cy, r11, d
+        mul     r11, r5, r10                C top limb: only the bits left in r5, nothing further to OR in
+        mla     r11, cy, r8, r11
+        str     r11, [rp]
+        pop     {r4-r11}                    C restores r4-r5 and r6-r11 in one go
+        bx      r14                         C return
+
+L(ed1): mul     r11, r5, r10                C single limb: quotient = (up[0] >> cnt) * inverse
+        str     r11, [rp]
+        pop     {r4-r11}
+        bx      r14                         C return
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora8/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v7a/cora8/gmp-mparam.h
new file mode 100644
index 0000000..5864841
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora8/gmp-mparam.h
@@ -0,0 +1,207 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 2019 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1000 MHz Cortex-A8 (beaglebone black) */ +/* FFT tuning limit = 9,464,348 */ +/* Generated by tuneup.c, 2019-10-23, gcc 6.3 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD MP_SIZE_T_MAX +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 /* 50.65% faster than 2 */ +#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 31 + +#define DIV_1_VS_MUL_1_PERCENT 192 + +#define MUL_TOOM22_THRESHOLD 39 +#define MUL_TOOM33_THRESHOLD 129 +#define MUL_TOOM44_THRESHOLD 226 +#define MUL_TOOM6H_THRESHOLD 366 +#define MUL_TOOM8H_THRESHOLD 620 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 141 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 183 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 154 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 160 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 193 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 46 +#define SQR_TOOM3_THRESHOLD 145 +#define SQR_TOOM4_THRESHOLD 375 +#define SQR_TOOM6_THRESHOLD 0 /* always */ +#define SQR_TOOM8_THRESHOLD 547 + +#define MULMID_TOOM42_THRESHOLD 38 + +#define MULMOD_BNM1_THRESHOLD 22 +#define SQRMOD_BNM1_THRESHOLD 23 + +#define MUL_FFT_MODF_THRESHOLD 476 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 476, 5}, { 28, 6}, { 15, 5}, { 31, 6}, \ + { 28, 7}, { 15, 6}, { 33, 7}, { 19, 6}, \ + { 39, 7}, { 27, 8}, { 15, 7}, { 35, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \ + { 27, 7}, { 55, 8}, { 31, 7}, { 63, 8}, \ + { 43, 9}, { 23, 8}, { 55, 9}, 
{ 31, 8}, \ + { 71, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 99, 9}, { 55,10}, { 31, 9}, { 87,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 167,10}, { 95, 9}, \ + { 199,10}, { 111,11}, { 63,10}, { 127, 9}, \ + { 255,10}, { 143, 9}, { 287, 8}, { 575,10}, \ + { 159, 9}, { 319,11}, { 95,10}, { 191, 9}, \ + { 383, 8}, { 767, 9}, { 399,10}, { 207,12}, \ + { 63,11}, { 127,10}, { 255, 9}, { 511,10}, \ + { 271, 9}, { 543,10}, { 287, 9}, { 575,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,10}, { 351, 9}, { 703,10}, { 367,11}, \ + { 191,10}, { 399, 9}, { 799,10}, { 415,11}, \ + { 223,12}, { 127,11}, { 255,10}, { 543,11}, \ + { 287,10}, { 607, 9}, { 1215,11}, { 319,10}, \ + { 671,11}, { 351,10}, { 703,12}, { 191,11}, \ + { 383,10}, { 799,11}, { 415,10}, { 863,11}, \ + { 447,13}, { 127,12}, { 255,11}, { 543,10}, \ + { 1087,11}, { 607,12}, { 319,11}, { 671,10}, \ + { 1343,11}, { 735,12}, { 383,11}, { 799,10}, \ + { 1599,11}, { 863,12}, { 447,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,13}, \ + { 383,12}, { 767,11}, { 1599,12}, { 831,11}, \ + { 1663,12}, { 959,14}, { 255,13}, { 511,12}, \ + { 1215,13}, { 639,12}, { 1407,13}, { 767,12}, \ + { 1663,13}, { 895,12}, { 1791,14}, { 511,13}, \ + { 1023,12}, { 2111,13}, { 1151,12}, { 4096,13}, \ + { 8192,14}, { 16384,15}, { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 139 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 436 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 436, 5}, { 25, 6}, { 13, 5}, { 27, 6}, \ + { 28, 7}, { 15, 6}, { 32, 7}, { 17, 6}, \ + { 35, 7}, { 19, 6}, { 39, 7}, { 29, 8}, \ + { 15, 7}, { 35, 8}, { 19, 7}, { 41, 8}, \ + { 23, 7}, { 49, 8}, { 27, 9}, { 15, 8}, \ + { 43, 9}, { 23, 8}, { 55,10}, { 15, 9}, \ + { 31, 8}, { 67, 9}, { 39, 8}, { 79, 9}, \ + { 47, 8}, { 95, 9}, { 55,10}, { 31, 9}, \ + { 79,10}, { 47, 9}, { 103,11}, { 31,10}, \ + { 63, 9}, { 135,10}, { 79, 9}, { 159, 8}, \ + { 319, 9}, { 
167,10}, { 95, 9}, { 191,10}, \ + { 111,11}, { 63,10}, { 127, 9}, { 255, 8}, \ + { 511, 9}, { 271,10}, { 143, 9}, { 287, 8}, \ + { 575, 9}, { 303,10}, { 159, 9}, { 319,11}, \ + { 95,10}, { 191, 9}, { 383, 8}, { 767, 9}, \ + { 399,12}, { 63,11}, { 127,10}, { 255, 9}, \ + { 511,10}, { 271, 9}, { 543,10}, { 287, 9}, \ + { 575,10}, { 303,11}, { 159,10}, { 319, 9}, \ + { 639,10}, { 335, 9}, { 671,10}, { 351, 9}, \ + { 703,10}, { 367,11}, { 191,10}, { 383, 9}, \ + { 767,10}, { 399, 9}, { 799,10}, { 415, 9}, \ + { 831,11}, { 223,10}, { 447,12}, { 127,11}, \ + { 255,10}, { 511, 9}, { 1023,10}, { 543,11}, \ + { 287,10}, { 607,11}, { 319,10}, { 671,11}, \ + { 351,10}, { 735,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 863,11}, { 447,10}, \ + { 895,13}, { 127,12}, { 255,11}, { 511,10}, \ + { 1023,11}, { 543,10}, { 1087,11}, { 607,12}, \ + { 319,11}, { 671,10}, { 1343,11}, { 735,12}, \ + { 383,11}, { 863,12}, { 447,11}, { 959,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,11}, \ + { 1407,13}, { 383,12}, { 767,11}, { 1599,12}, \ + { 831,11}, { 1663,12}, { 959,14}, { 255,13}, \ + { 511,12}, { 1215,13}, { 639,12}, { 1471,13}, \ + { 767,12}, { 1663,13}, { 895,12}, { 1855,14}, \ + { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \ + { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 152 +#define SQR_FFT_THRESHOLD 3712 + +#define MULLO_BASECASE_THRESHOLD 21 +#define MULLO_DC_THRESHOLD 0 /* never mpn_mullo_basecase */ +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 9 +#define SQRLO_DC_THRESHOLD 17 +#define SQRLO_SQR_THRESHOLD 7246 + +#define DC_DIV_QR_THRESHOLD 27 +#define DC_DIVAPPR_Q_THRESHOLD 74 +#define DC_BDIV_QR_THRESHOLD 21 +#define DC_BDIV_Q_THRESHOLD 64 + +#define INV_MULMOD_BNM1_THRESHOLD 78 +#define INV_NEWTON_THRESHOLD 31 +#define INV_APPR_THRESHOLD 37 + +#define BINV_NEWTON_THRESHOLD 167 +#define REDC_1_TO_REDC_2_THRESHOLD 4 +#define REDC_2_TO_REDC_N_THRESHOLD 
198 + +#define MU_DIV_QR_THRESHOLD 1858 +#define MU_DIVAPPR_Q_THRESHOLD 1685 +#define MUPI_DIV_QR_THRESHOLD 43 +#define MU_BDIV_QR_THRESHOLD 1589 +#define MU_BDIV_Q_THRESHOLD 1685 + +#define POWM_SEC_TABLE 1,13,96,487,1378 + +#define GET_STR_DC_THRESHOLD 18 +#define GET_STR_PRECOMPUTE_THRESHOLD 36 +#define SET_STR_DC_THRESHOLD 145 +#define SET_STR_PRECOMPUTE_THRESHOLD 505 + +#define FAC_DSC_THRESHOLD 137 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 24 +#define HGCD2_DIV1_METHOD 5 /* 4.29% faster than 4 */ +#define HGCD_THRESHOLD 39 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 3524 +#define GCD_DC_THRESHOLD 116 +#define GCDEXT_DC_THRESHOLD 124 +#define JACOBI_BASE_METHOD 4 /* 5.89% faster than 1 */ + +/* Tuneup completed successfully, took 48230 seconds */ diff --git a/gmp-6.3.0/mpn/arm/v7a/cora9/bdiv_q_1.asm b/gmp-6.3.0/mpn/arm/v7a/cora9/bdiv_q_1.asm new file mode 100644 index 0000000..245b371 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora9/bdiv_q_1.asm @@ -0,0 +1,36 @@ +dnl ARM mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2012, 2013, 2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. 
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+C This cora9 file has no code of its own: it names the two entry points it provides and pulls in the cora8 implementation.
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+include_mpn(`arm/v7a/cora8/bdiv_q_1.asm')
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora9/gmp-mparam.h b/gmp-6.3.0/mpn/arm/v7a/cora9/gmp-mparam.h
new file mode 100644
index 0000000..5c54012
--- /dev/null
+++ b/gmp-6.3.0/mpn/arm/v7a/cora9/gmp-mparam.h
@@ -0,0 +1,211 @@
+/* gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 1999-2003, 2009, 2010, 2012-2015 Free Software
+Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/.
*/ + +#define GMP_LIMB_BITS 32 +#define GMP_LIMB_BYTES 4 + +/* 1000 MHz Cortex-A9 */ +/* FFT tuning limit = 25 M */ +/* Generated by tuneup.c, 2014-03-12, gcc 4.6 */ + +#define MOD_1_NORM_THRESHOLD 0 /* always */ +#define MOD_1_UNNORM_THRESHOLD 0 /* always */ +#define MOD_1N_TO_MOD_1_1_THRESHOLD 5 +#define MOD_1U_TO_MOD_1_1_THRESHOLD 4 +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 8 +#define MOD_1_2_TO_MOD_1_4_THRESHOLD MP_SIZE_T_MAX +#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 12 +#define USE_PREINV_DIVREM_1 1 /* native */ +#define DIV_QR_1N_PI1_METHOD 1 +#define DIV_QR_1_NORM_THRESHOLD 5 +#define DIV_QR_1_UNNORM_THRESHOLD 1 +#define DIV_QR_2_PI2_THRESHOLD MP_SIZE_T_MAX /* never */ +#define DIVEXACT_1_THRESHOLD 0 /* always (native) */ +#define BMOD_1_TO_MOD_1_THRESHOLD 20 + +#define DIV_1_VS_MUL_1_PERCENT 190 + +#define MUL_TOOM22_THRESHOLD 45 +#define MUL_TOOM33_THRESHOLD 129 +#define MUL_TOOM44_THRESHOLD 387 +#define MUL_TOOM6H_THRESHOLD 537 +#define MUL_TOOM8H_THRESHOLD 774 + +#define MUL_TOOM32_TO_TOOM43_THRESHOLD 141 +#define MUL_TOOM32_TO_TOOM53_THRESHOLD 237 +#define MUL_TOOM42_TO_TOOM53_THRESHOLD 141 +#define MUL_TOOM42_TO_TOOM63_THRESHOLD 258 +#define MUL_TOOM43_TO_TOOM54_THRESHOLD 211 + +#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ +#define SQR_TOOM2_THRESHOLD 64 +#define SQR_TOOM3_THRESHOLD 189 +#define SQR_TOOM4_THRESHOLD 517 +#define SQR_TOOM6_THRESHOLD 656 +#define SQR_TOOM8_THRESHOLD 0 /* always */ + +#define MULMID_TOOM42_THRESHOLD 62 + +#define MULMOD_BNM1_THRESHOLD 23 +#define SQRMOD_BNM1_THRESHOLD 28 + +#define MUL_FFT_MODF_THRESHOLD 630 /* k = 5 */ +#define MUL_FFT_TABLE3 \ + { { 630, 5}, { 29, 6}, { 15, 5}, { 33, 6}, \ + { 17, 5}, { 35, 6}, { 36, 7}, { 19, 6}, \ + { 40, 7}, { 21, 6}, { 43, 7}, { 23, 6}, \ + { 47, 7}, { 25, 6}, { 51, 7}, { 27, 6}, \ + { 55, 7}, { 29, 8}, { 15, 7}, { 37, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \ + { 27, 7}, { 57, 9}, { 15, 8}, { 31, 7}, \ + { 65, 8}, { 35, 7}, { 71, 8}, { 43, 9}, \ + { 23, 8}, { 55, 9}, { 
31, 8}, { 71, 9}, \ + { 39, 8}, { 83, 9}, { 47, 8}, { 99, 9}, \ + { 55,10}, { 31, 9}, { 79,10}, { 47, 9}, \ + { 103,11}, { 31,10}, { 63, 9}, { 135,10}, \ + { 79, 9}, { 167,10}, { 95, 9}, { 191,10}, \ + { 111,11}, { 63,10}, { 159,11}, { 95,10}, \ + { 191, 9}, { 383,12}, { 63,11}, { 127,10}, \ + { 255, 9}, { 511,10}, { 271, 9}, { 543,11}, \ + { 159,10}, { 319, 9}, { 639,10}, { 335, 9}, \ + { 671,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,10}, { 415,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 511, 9}, { 1023,10}, \ + { 543,11}, { 287,10}, { 607,11}, { 319,10}, \ + { 671,11}, { 351,12}, { 191,11}, { 383,10}, \ + { 799,11}, { 415,10}, { 831,13}, { 127,12}, \ + { 255,11}, { 511,10}, { 1023,11}, { 543,10}, \ + { 1087,11}, { 607,12}, { 319,11}, { 735,12}, \ + { 383,11}, { 831,12}, { 447,11}, { 927,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1215,12}, { 639,11}, { 1343,12}, { 703,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \ + { 1663,12}, { 895,14}, { 255,13}, { 511,12}, \ + { 1023,11}, { 2047,12}, { 1151,13}, { 639,12}, \ + { 1407,13}, { 767,12}, { 1663,13}, { 895,12}, \ + { 1791,14}, { 511,13}, { 1023,12}, { 2111,13}, \ + { 1151,12}, { 2431,13}, { 1279,12}, { 2559,13}, \ + { 1407,14}, { 767,13}, { 1535,12}, { 3071,13}, \ + { 1663,12}, { 3455,13}, { 1791,15}, { 511,14}, \ + { 1023,13}, { 2047,12}, { 4095,13}, { 2175,12}, \ + { 4479,13}, { 2431,14}, { 1279,13}, { 2559,12}, \ + { 5119,13}, { 2815,12}, { 5631,14}, { 16384,15}, \ + { 32768,16} } +#define MUL_FFT_TABLE3_SIZE 157 +#define MUL_FFT_THRESHOLD 6784 + +#define SQR_FFT_MODF_THRESHOLD 565 /* k = 5 */ +#define SQR_FFT_TABLE3 \ + { { 565, 5}, { 19, 4}, { 40, 5}, { 21, 4}, \ + { 43, 5}, { 28, 6}, { 15, 5}, { 35, 6}, \ + { 29, 7}, { 15, 6}, { 37, 7}, { 19, 6}, \ + { 39, 7}, { 21, 6}, { 43, 7}, { 23, 6}, \ + { 47, 7}, { 29, 8}, { 15, 7}, { 37, 8}, \ + { 19, 7}, { 43, 8}, { 23, 7}, { 51, 8}, \ + { 27, 7}, { 55, 9}, { 15, 8}, { 31, 7}, \ + { 65, 8}, { 35, 7}, { 71, 8}, { 43, 9}, \ + { 
23, 8}, { 55,10}, { 15, 9}, { 31, 8}, \ + { 71, 9}, { 39, 8}, { 83, 9}, { 47, 8}, \ + { 95, 9}, { 55,10}, { 31, 9}, { 79,10}, \ + { 47, 9}, { 103,11}, { 31,10}, { 63, 9}, \ + { 135,10}, { 79, 9}, { 159,10}, { 95, 9}, \ + { 191,10}, { 111,11}, { 63,10}, { 159,11}, \ + { 95,10}, { 191, 9}, { 383,12}, { 63,11}, \ + { 127,10}, { 255, 9}, { 511, 8}, { 1023, 9}, \ + { 527,10}, { 271, 9}, { 543,10}, { 287,11}, \ + { 159, 9}, { 639,10}, { 335, 9}, { 671,10}, \ + { 351,11}, { 191,10}, { 383, 9}, { 767,10}, \ + { 399, 9}, { 799,10}, { 415,11}, { 223,12}, \ + { 127,11}, { 255,10}, { 511, 9}, { 1023,10}, \ + { 543,11}, { 287,10}, { 671,11}, { 351,12}, \ + { 191,11}, { 383,10}, { 799,11}, { 415,10}, \ + { 831,13}, { 127,12}, { 255,11}, { 511,10}, \ + { 1023,11}, { 543,10}, { 1087,11}, { 735,12}, \ + { 383,11}, { 831,12}, { 447,11}, { 927,13}, \ + { 255,12}, { 511,11}, { 1087,12}, { 575,11}, \ + { 1151,12}, { 639,11}, { 1343,12}, { 703,13}, \ + { 383,12}, { 767,11}, { 1535,12}, { 831,11}, \ + { 1663,12}, { 959,13}, { 511,12}, { 1023,11}, \ + { 2047,12}, { 1151,13}, { 639,12}, { 1407,13}, \ + { 767,12}, { 1599,13}, { 895,12}, { 1791,14}, \ + { 511,13}, { 1023,12}, { 2111,13}, { 1151,12}, \ + { 2431,13}, { 1279,12}, { 2559,13}, { 1407,14}, \ + { 767,13}, { 1535,12}, { 3071,13}, { 1663,12}, \ + { 3455,13}, { 1791,15}, { 511,14}, { 1023,13}, \ + { 2047,12}, { 4095,13}, { 2175,12}, { 4479,13}, \ + { 2303,14}, { 1279,13}, { 2559,12}, { 5119,13}, \ + { 2815,14}, { 16384,15}, { 32768,16} } +#define SQR_FFT_TABLE3_SIZE 155 +#define SQR_FFT_THRESHOLD 5568 + +#define MULLO_BASECASE_THRESHOLD 0 /* always */ +#define MULLO_DC_THRESHOLD 37 +#define MULLO_MUL_N_THRESHOLD 13463 +#define SQRLO_BASECASE_THRESHOLD 12 +#define SQRLO_DC_THRESHOLD 22 +#define SQRLO_SQR_THRESHOLD 10950 + +#define DC_DIV_QR_THRESHOLD 32 +#define DC_DIVAPPR_Q_THRESHOLD 99 +#define DC_BDIV_QR_THRESHOLD 43 +#define DC_BDIV_Q_THRESHOLD 102 + +#define INV_MULMOD_BNM1_THRESHOLD 88 +#define INV_NEWTON_THRESHOLD 141 +#define 
INV_APPR_THRESHOLD 111 + +#define BINV_NEWTON_THRESHOLD 312 +#define REDC_1_TO_REDC_2_THRESHOLD 6 +#define REDC_2_TO_REDC_N_THRESHOLD 140 + +#define MU_DIV_QR_THRESHOLD 2492 +#define MU_DIVAPPR_Q_THRESHOLD 2130 +#define MUPI_DIV_QR_THRESHOLD 55 +#define MU_BDIV_QR_THRESHOLD 2130 +#define MU_BDIV_Q_THRESHOLD 2172 + +#define POWM_SEC_TABLE 40,53,56,71,1985 + +#define GET_STR_DC_THRESHOLD 16 +#define GET_STR_PRECOMPUTE_THRESHOLD 33 +#define SET_STR_DC_THRESHOLD 172 +#define SET_STR_PRECOMPUTE_THRESHOLD 671 + +#define FAC_DSC_THRESHOLD 309 +#define FAC_ODD_THRESHOLD 29 + +#define MATRIX22_STRASSEN_THRESHOLD 24 +#define HGCD_THRESHOLD 61 +#define HGCD_APPR_THRESHOLD 50 +#define HGCD_REDUCE_THRESHOLD 4120 +#define GCD_DC_THRESHOLD 408 +#define GCDEXT_DC_THRESHOLD 303 +#define JACOBI_BASE_METHOD 4 -- cgit v1.2.3