diff options
author | Duncan Wilkie <antigravityd@gmail.com> | 2023-11-18 06:11:09 -0600 |
---|---|---|
committer | Duncan Wilkie <antigravityd@gmail.com> | 2023-11-18 06:11:09 -0600 |
commit | 11da511c784eca003deb90c23570f0873954e0de (patch) | |
tree | e14fdd3d5d6345956d67e79ae771d0633d28362b /gmp-6.3.0/mpn/arm/v7a/cora15/neon |
Initial commit.
Diffstat (limited to 'gmp-6.3.0/mpn/arm/v7a/cora15/neon')
-rw-r--r-- | gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm | 43 | ||||
-rw-r--r-- | gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm | 43 | ||||
-rw-r--r-- | gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm | 144 | ||||
-rw-r--r-- | gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm | 97 | ||||
-rw-r--r-- | gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm | 110 | ||||
-rw-r--r-- | gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm | 90 | ||||
-rw-r--r-- | gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm | 177 |
7 files changed, 704 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm new file mode 100644 index 0000000..d8cfe3f --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh1_n.asm @@ -0,0 +1,43 @@ +dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 1) + +ifdef(`OPERATION_addlsh1_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh1_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh1_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n mpn_rsblsh1_n) + +include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm new file mode 100644 index 0000000..b48204d --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlsh2_n.asm @@ -0,0 +1,43 @@ +dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +define(LSH, 2) + +ifdef(`OPERATION_addlsh2_n',`define(`DO_add')') +ifdef(`OPERATION_sublsh2_n',`define(`DO_sub')') +ifdef(`OPERATION_rsblsh2_n',`define(`DO_rsb')') + +MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_sublsh2_n mpn_rsblsh2_n) + +include_mpn(`arm/v7a/cora15/neon/aorsorrlshC_n.asm') diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm new file mode 100644 index 0000000..51f93c1 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/aorsorrlshC_n.asm @@ -0,0 +1,144 @@ +dnl ARM mpn_addlshC_n, mpn_sublshC_n, mpn_rsblshC_n + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 5.25 +C Cortex-A15 2.25 + +C TODO +C * Consider using 4-way feed-in code. +C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps +C insufficiently for A7 and A8. + +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +ifdef(`DO_add', ` + define(`ADCSBCS', `adcs $1, $2, $3') + define(`CLRCY', `cmn r13, #1') + define(`RETVAL', `adc r0, $1, #0') + define(`func', mpn_addlsh`'LSH`'_n)') +ifdef(`DO_sub', ` + define(`ADCSBCS', `sbcs $1, $2, $3') + define(`CLRCY', `cmp r13, #0') + define(`RETVAL', `sbc $2, $2, $2 + cmn $2, #1 + adc r0, $1, #0') + define(`func', mpn_sublsh`'LSH`'_n)') +ifdef(`DO_rsb', ` + define(`ADCSBCS', `sbcs $1, $3, $2') + define(`CLRCY', `cmp r13, #0') + define(`RETVAL', `sbc r0, $1, #0') + define(`func', mpn_rsblsh`'LSH`'_n)') + + +ASM_START() +PROLOGUE(func) + push {r4-r10} + vmov.i8 d0, #0 C could feed carry through here + CLRCY + tst n, #1 + beq L(bb0) + +L(bb1): vld1.32 {d3[0]}, [vp]! + vsli.u32 d0, d3, #LSH + ldr r12, [up], #4 + vmov.32 r5, d0[0] + vshr.u32 d0, d3, #32-LSH + ADCSBCS( r12, r12, r5) + str r12, [rp], #4 + bics n, n, #1 + beq L(rtn) + +L(bb0): tst n, #2 + beq L(b00) + +L(b10): vld1.32 {d3}, [vp]! + vsli.u64 d0, d3, #LSH + ldmia up!, {r10,r12} + vmov r4, r5, d0 + vshr.u64 d0, d3, #64-LSH + ADCSBCS( r10, r10, r4) + ADCSBCS( r12, r12, r5) + stmia rp!, {r10,r12} + bics n, n, #2 + beq L(rtn) + +L(b00): vld1.32 {d2}, [vp]! + vsli.u64 d0, d2, #LSH + vshr.u64 d1, d2, #64-LSH + vld1.32 {d3}, [vp]! + vsli.u64 d1, d3, #LSH + vmov r6, r7, d0 + vshr.u64 d0, d3, #64-LSH + sub n, n, #4 + tst n, n + beq L(end) + + ALIGN(16) +L(top): ldmia up!, {r8,r9,r10,r12} + vld1.32 {d2}, [vp]! + vsli.u64 d0, d2, #LSH + vmov r4, r5, d1 + vshr.u64 d1, d2, #64-LSH + ADCSBCS( r8, r8, r6) + ADCSBCS( r9, r9, r7) + vld1.32 {d3}, [vp]! + vsli.u64 d1, d3, #LSH + vmov r6, r7, d0 + vshr.u64 d0, d3, #64-LSH + ADCSBCS( r10, r10, r4) + ADCSBCS( r12, r12, r5) + stmia rp!, {r8,r9,r10,r12} + sub n, n, #4 + tst n, n + bne L(top) + +L(end): ldmia up!, {r8,r9,r10,r12} + vmov r4, r5, d1 + ADCSBCS( r8, r8, r6) + ADCSBCS( r9, r9, r7) + ADCSBCS( r10, r10, r4) + ADCSBCS( r12, r12, r5) + stmia rp!, {r8,r9,r10,r12} +L(rtn): vmov.32 r0, d0[0] + RETVAL( r0, r1) + pop {r4-r10} + bx r14 +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm new file mode 100644 index 0000000..9e7a629 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/com.asm @@ -0,0 +1,97 @@ +dnl ARM Neon mpn_com optimised for A15. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM ? +C XScale ? +C Cortex-A8 ? +C Cortex-A9 2.1 +C Cortex-A15 0.65 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_com) + cmp n, #7 + ble L(bc) + +C Perform a few initial operation until rp is 128-bit aligned + tst rp, #4 + beq L(al1) + vld1.32 {d0[0]}, [up]! + sub n, n, #1 + vmvn d0, d0 + vst1.32 {d0[0]}, [rp]! +L(al1): tst rp, #8 + beq L(al2) + vld1.32 {d0}, [up]! + sub n, n, #2 + vmvn d0, d0 + vst1.32 {d0}, [rp:64]! +L(al2): vld1.32 {q2}, [up]! + subs n, n, #12 + blt L(end) + + ALIGN(16) +L(top): vld1.32 {q0}, [up]! + vmvn q2, q2 + subs n, n, #8 + vst1.32 {q2}, [rp:128]! + vld1.32 {q2}, [up]! + vmvn q0, q0 + vst1.32 {q0}, [rp:128]! + bge L(top) + +L(end): vmvn q2, q2 + vst1.32 {q2}, [rp:128]! + +C Handle last 0-7 limbs. Note that rp is aligned after loop, but not when we +C arrive here via L(bc) +L(bc): tst n, #4 + beq L(tl1) + vld1.32 {q0}, [up]! + vmvn q0, q0 + vst1.32 {q0}, [rp]! +L(tl1): tst n, #2 + beq L(tl2) + vld1.32 {d0}, [up]! + vmvn d0, d0 + vst1.32 {d0}, [rp]! +L(tl2): tst n, #1 + beq L(tl3) + vld1.32 {d0[0]}, [up] + vmvn d0, d0 + vst1.32 {d0[0]}, [rp] +L(tl3): bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm new file mode 100644 index 0000000..98fe535 --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyd.asm @@ -0,0 +1,110 @@ +dnl ARM Neon mpn_copyd optimised for A15. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.75 slower than core register code +C Cortex-A15 0.52 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_copyd) + add rp, rp, n, lsl #2 + add up, up, n, lsl #2 + + cmp n, #7 + ble L(bc) + +C Copy until rp is 128-bit aligned + tst rp, #4 + beq L(al1) + sub up, up, #4 + vld1.32 {d22[0]}, [up] + sub n, n, #1 + sub rp, rp, #4 + vst1.32 {d22[0]}, [rp] +L(al1): tst rp, #8 + beq L(al2) + sub up, up, #8 + vld1.32 {d22}, [up] + sub n, n, #2 + sub rp, rp, #8 + vst1.32 {d22}, [rp:64] +L(al2): sub up, up, #16 + vld1.32 {d26-d27}, [up] + subs n, n, #12 + sub rp, rp, #16 C offset rp for loop + blt L(end) + + sub up, up, #16 C offset up for loop + mov r12, #-16 + + ALIGN(16) +L(top): vld1.32 {d22-d23}, [up], r12 + vst1.32 {d26-d27}, [rp:128], r12 + vld1.32 {d26-d27}, [up], r12 + vst1.32 {d22-d23}, [rp:128], r12 + subs n, n, #8 + bge L(top) + + add up, up, #16 C undo up offset + C rp offset undoing folded +L(end): vst1.32 {d26-d27}, [rp:128] + +C Copy last 0-7 limbs. Note that rp is aligned after loop, but not when we +C arrive here via L(bc) +L(bc): tst n, #4 + beq L(tl1) + sub up, up, #16 + vld1.32 {d22-d23}, [up] + sub rp, rp, #16 + vst1.32 {d22-d23}, [rp] +L(tl1): tst n, #2 + beq L(tl2) + sub up, up, #8 + vld1.32 {d22}, [up] + sub rp, rp, #8 + vst1.32 {d22}, [rp] +L(tl2): tst n, #1 + beq L(tl3) + sub up, up, #4 + vld1.32 {d22[0]}, [up] + sub rp, rp, #4 + vst1.32 {d22[0]}, [rp] +L(tl3): bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm new file mode 100644 index 0000000..2e05afe --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/copyi.asm @@ -0,0 +1,90 @@ +dnl ARM Neon mpn_copyi optimised for A15. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 1.75 slower than core register code +C Cortex-A15 0.52 + +define(`rp', `r0') +define(`up', `r1') +define(`n', `r2') + +ASM_START() +PROLOGUE(mpn_copyi) + cmp n, #7 + ble L(bc) + +C Copy until rp is 128-bit aligned + tst rp, #4 + beq L(al1) + vld1.32 {d22[0]}, [up]! + sub n, n, #1 + vst1.32 {d22[0]}, [rp]! +L(al1): tst rp, #8 + beq L(al2) + vld1.32 {d22}, [up]! + sub n, n, #2 + vst1.32 {d22}, [rp:64]! +L(al2): vld1.32 {d26-d27}, [up]! + subs n, n, #12 + blt L(end) + + ALIGN(16) +L(top): vld1.32 {d22-d23}, [up]! + vst1.32 {d26-d27}, [rp:128]! + vld1.32 {d26-d27}, [up]! + vst1.32 {d22-d23}, [rp:128]! + subs n, n, #8 + bge L(top) + +L(end): vst1.32 {d26-d27}, [rp:128]! + +C Copy last 0-7 limbs. Note that rp is aligned after loop, but not when we +C arrive here via L(bc) +L(bc): tst n, #4 + beq L(tl1) + vld1.32 {d22-d23}, [up]! + vst1.32 {d22-d23}, [rp]! +L(tl1): tst n, #2 + beq L(tl2) + vld1.32 {d22}, [up]! + vst1.32 {d22}, [rp]! +L(tl2): tst n, #1 + beq L(tl3) + vld1.32 {d22[0]}, [up] + vst1.32 {d22[0]}, [rp] +L(tl3): bx lr +EPILOGUE() diff --git a/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm new file mode 100644 index 0000000..2c11d6d --- /dev/null +++ b/gmp-6.3.0/mpn/arm/v7a/cora15/neon/rsh1aors_n.asm @@ -0,0 +1,177 @@ +dnl ARM Neon mpn_rsh1add_n, mpn_rsh1sub_n. + +dnl Contributed to the GNU project by Torbjörn Granlund. + +dnl Copyright 2013 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C StrongARM - +C XScale - +C Cortex-A7 ? +C Cortex-A8 ? +C Cortex-A9 4-5 +C Cortex-A15 2.5 + +C TODO +C * Try to make this smaller, its size (384 bytes) is excessive. +C * Try to reach 2.25 c/l on A15, to match the addlsh_1 family. +C * This is ad-hoc scheduled, perhaps unnecessarily so for A15, and perhaps +C insufficiently for A7 and A8. + +define(`rp', `r0') +define(`up', `r1') +define(`vp', `r2') +define(`n', `r3') + +ifdef(`OPERATION_rsh1add_n', ` + define(`ADDSUBS', `adds $1, $2, $3') + define(`ADCSBCS', `adcs $1, $2, $3') + define(`IFADD', `$1') + define(`IFSUB', `') + define(`func', mpn_rsh1add_n)') +ifdef(`OPERATION_rsh1sub_n', ` + define(`ADDSUBS', `subs $1, $2, $3') + define(`ADCSBCS', `sbcs $1, $2, $3') + define(`IFADD', `') + define(`IFSUB', `$1') + define(`func', mpn_rsh1sub_n)') + +MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n) + +ASM_START() +PROLOGUE(func) + push {r4-r10} + + ands r4, n, #3 + beq L(b00) + cmp r4, #2 + blo L(b01) + beq L(b10) + +L(b11): ldmia up!, {r9,r10,r12} + ldmia vp!, {r5,r6,r7} + ADDSUBS( r9, r9, r5) + vmov d4, r9, r9 + ADCSBCS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vshr.u64 d3, d4, #1 + vmov d1, r10, r12 + vsli.u64 d3, d1, #31 + vshr.u64 d2, d1, #1 + vst1.32 d3[0], [rp]! + bics n, n, #3 + beq L(wd2) +L(gt3): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + b L(mi0) + +L(b10): ldmia up!, {r10,r12} + ldmia vp!, {r6,r7} + ADDSUBS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vmov d4, r10, r12 + bics n, n, #2 + vshr.u64 d2, d4, #1 + beq L(wd2) +L(gt2): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + b L(mi0) + +L(b01): ldr r12, [up], #4 + ldr r7, [vp], #4 + ADDSUBS( r12, r12, r7) + vmov d4, r12, r12 + bics n, n, #1 + bne L(gt1) + mov r5, r12, lsr #1 +IFADD(` adc r1, n, #0') +IFSUB(` adc r1, n, #1') + bfi r5, r1, #31, #1 + str r5, [rp] + and r0, r12, #1 + pop {r4-r10} + bx r14 +L(gt1): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + vshr.u64 d2, d4, #1 + ADCSBCS( r8, r8, r4) + ADCSBCS( r9, r9, r5) + vmov d0, r8, r9 + ADCSBCS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vsli.u64 d2, d0, #31 + vshr.u64 d3, d0, #1 + vst1.32 d2[0], [rp]! + b L(mi1) + +L(b00): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + ADDSUBS( r8, r8, r4) + ADCSBCS( r9, r9, r5) + vmov d4, r8, r9 + ADCSBCS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vshr.u64 d3, d4, #1 + b L(mi1) + + ALIGN(16) +L(top): ldmia up!, {r8,r9,r10,r12} + ldmia vp!, {r4,r5,r6,r7} + vsli.u64 d3, d1, #63 + vshr.u64 d2, d1, #1 + vst1.32 d3, [rp]! +L(mi0): ADCSBCS( r8, r8, r4) + ADCSBCS( r9, r9, r5) + vmov d0, r8, r9 + ADCSBCS( r10, r10, r6) + ADCSBCS( r12, r12, r7) + vsli.u64 d2, d0, #63 + vshr.u64 d3, d0, #1 + vst1.32 d2, [rp]! +L(mi1): vmov d1, r10, r12 + sub n, n, #4 + tst n, n + bne L(top) + +L(end): vsli.u64 d3, d1, #63 + vshr.u64 d2, d1, #1 + vst1.32 d3, [rp]! +L(wd2): vmov r4, r5, d2 +IFADD(` adc r1, n, #0') +IFSUB(` adc r1, n, #1') + bfi r5, r1, #31, #1 + stm rp, {r4,r5} + +L(rtn): vmov.32 r0, d4[0] + and r0, r0, #1 + pop {r4-r10} + bx r14 +EPILOGUE() |