Diffstat (limited to 'gmp-6.3.0/mpn/powerpc32/vmx')
-rw-r--r--   gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm         203
-rw-r--r--   gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm         198
-rw-r--r--   gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm      310
-rw-r--r--   gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm   388
-rw-r--r--   gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm       34
5 files changed, 1133 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm b/gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm
new file mode 100644
index 0000000..dee7266
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/vmx/copyd.asm
@@ -0,0 +1,203 @@
+dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyd.
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                    16-byte coaligned      unaligned
+C                       cycles/limb        cycles/limb
+C 7400,7410 (G4):           0.5               0.64
+C 744x,745x (G4+):          0.75              0.82
+C 970 (G5):                 0.78              1.02       (64-bit limbs)
+
+C STATUS
+C * Works for all sizes and alignments.
+
+C TODO
+C * Optimize unaligned case. Some basic tests with 2-way and 4-way unrolling
+C indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
+C c/l for 970.
+C * Consider using VMX instructions also for head and tail, by using some
+C read-modify-write tricks.
+C * The VMX code is used from the smallest sizes it handles, but measurements
+C show a large speed bump at the cutoff points. Small copying (perhaps
+C using some read-modify-write technique) should be optimized.
+C * Make an mpn_com based on this code.
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
+
+
+ifelse(GMP_LIMB_BITS,32,`
+ define(`LIMB32',` $1')
+ define(`LIMB64',`')
+',`
+ define(`LIMB32',`')
+ define(`LIMB64',` $1')
+')
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+
+define(`us', `v4')
+
+
+ASM_START()
+PROLOGUE(mpn_copyd)
+
+LIMB32(`slwi. r0, n, 2 ')
+LIMB64(`sldi. r0, n, 3 ')
+ add rp, rp, r0
+ add up, up, r0
+
+LIMB32(`cmpi cr7, n, 11 ')
+LIMB64(`cmpdi cr7, n, 5 ')
+ bge cr7, L(big)
+
+ beqlr cr0
+
+C Handle small cases with plain operations
+ mtctr n
+L(topS):
+LIMB32(`lwz r0, -4(up) ')
+LIMB64(`ld r0, -8(up) ')
+ addi up, up, -GMP_LIMB_BYTES
+LIMB32(`stw r0, -4(rp) ')
+LIMB64(`std r0, -8(rp) ')
+ addi rp, rp, -GMP_LIMB_BYTES
+ bdnz L(topS)
+ blr
+
+C Handle large cases with VMX operations
+L(big):
+ addi rp, rp, -16
+ addi up, up, -16
+ mfspr r12, 256
+ oris r0, r12, 0xf800 C Set VRSAVE bits 0-4
+ mtspr 256, r0
+
+LIMB32(`rlwinm. r7, rp, 30,30,31') C (rp >> 2) mod 4
+LIMB64(`rlwinm. r7, rp, 29,31,31') C (rp >> 3) mod 2
+ beq L(rp_aligned)
+
+ subf n, r7, n
+L(top0):
+LIMB32(`lwz r0, 12(up) ')
+LIMB64(`ld r0, 8(up) ')
+ addi up, up, -GMP_LIMB_BYTES
+LIMB32(`addic. r7, r7, -1 ')
+LIMB32(`stw r0, 12(rp) ')
+LIMB64(`std r0, 8(rp) ')
+ addi rp, rp, -GMP_LIMB_BYTES
+LIMB32(`bne L(top0) ')
+
+L(rp_aligned):
+
+LIMB32(`rlwinm. r0, up, 30,30,31') C (up >> 2) mod 4
+LIMB64(`rlwinm. r0, up, 29,31,31') C (up >> 3) mod 2
+
+LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n
+LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n
+ mtctr r7 C copy n to count register
+
+ li r10, -16
+
+ beq L(up_aligned)
+
+ lvsl us, 0, up
+
+ addi up, up, 16
+LIMB32(`andi. r0, n, 0x4 ')
+LIMB64(`andi. r0, n, 0x2 ')
+ beq L(1)
+ lvx v0, 0, up
+ lvx v2, r10, up
+ vperm v3, v2, v0, us
+ stvx v3, 0, rp
+ addi up, up, -32
+ addi rp, rp, -16
+ b L(lpu)
+L(1): lvx v2, 0, up
+ addi up, up, -16
+ b L(lpu)
+
+ ALIGN(32)
+L(lpu): lvx v0, 0, up
+ vperm v3, v0, v2, us
+ stvx v3, 0, rp
+ lvx v2, r10, up
+ addi up, up, -32
+ vperm v3, v2, v0, us
+ stvx v3, r10, rp
+ addi rp, rp, -32
+ bdnz L(lpu)
+
+ b L(tail)
+
+L(up_aligned):
+
+LIMB32(`andi. r0, n, 0x4 ')
+LIMB64(`andi. r0, n, 0x2 ')
+ beq L(lpa)
+ lvx v0, 0, up
+ stvx v0, 0, rp
+ addi up, up, -16
+ addi rp, rp, -16
+ b L(lpa)
+
+ ALIGN(32)
+L(lpa): lvx v0, 0, up
+ lvx v1, r10, up
+ addi up, up, -32
+ nop
+ stvx v0, 0, rp
+ stvx v1, r10, rp
+ addi rp, rp, -32
+ bdnz L(lpa)
+
+L(tail):
+LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4
+LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2
+ beq L(ret)
+LIMB32(`li r10, 12 ')
+L(top2):
+LIMB32(`lwzx r0, r10, up ')
+LIMB64(`ld r0, 8(up) ')
+LIMB32(`addic. r7, r7, -1 ')
+LIMB32(`stwx r0, r10, rp ')
+LIMB64(`std r0, 8(rp) ')
+LIMB32(`addi r10, r10, -GMP_LIMB_BYTES')
+LIMB32(`bne L(top2) ')
+
+L(ret): mtspr 256, r12
+ blr
+EPILOGUE()
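
For orientation, a rough C equivalent of what mpn_copyd computes (an
illustrative sketch under the usual mpn conventions; the name ref_copyd and
the typedef are assumptions, not GMP source):

    #include <stddef.h>

    typedef unsigned long mp_limb_t;  /* assumption: one limb per machine word */

    /* Copy n limbs from up[] to rp[], most significant limb first.  Copying
       downward is what makes the routine safe for overlapping operands when
       the destination starts at a higher address (rp >= up). */
    static void
    ref_copyd (mp_limb_t *rp, const mp_limb_t *up, size_t n)
    {
      while (n-- > 0)
        rp[n] = up[n];
    }

The assembly above performs the same copy, but moves 16 bytes per lvx/stvx
pair once rp is 16-byte aligned, uses vperm to merge misaligned source
vectors, and handles short operands plus the head and tail limbs with the
scalar limb loop.
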
diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm b/gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm
new file mode 100644
index 0000000..992b468
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/vmx/copyi.asm
@@ -0,0 +1,198 @@
+dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi.
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                    16-byte coaligned      unaligned
+C                       cycles/limb        cycles/limb
+C 7400,7410 (G4):           0.5               0.64
+C 744x,745x (G4+):          0.75              0.82
+C 970 (G5):                 0.78              1.02       (64-bit limbs)
+
+C STATUS
+C * Works for all sizes and alignments.
+
+C TODO
+C * Optimize unaligned case. Some basic tests with 2-way and 4-way unrolling
+C indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
+C c/l for 970.
+C * Consider using VMX instructions also for head and tail, by using some
+C read-modify-write tricks.
+C * The VMX code is used from the smallest sizes it handles, but measurements
+C show a large speed bump at the cutoff points. Small copying (perhaps
+C using some read-modify-write technique) should be optimized.
+C * Make an mpn_com based on this code.
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
+
+
+ifelse(GMP_LIMB_BITS,32,`
+ define(`LIMB32',` $1')
+ define(`LIMB64',`')
+',`
+ define(`LIMB32',`')
+ define(`LIMB64',` $1')
+')
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+
+define(`us', `v4')
+
+
+ASM_START()
+PROLOGUE(mpn_copyi)
+
+LIMB32(`cmpi cr7, n, 11 ')
+LIMB64(`cmpdi cr7, n, 5 ')
+ bge cr7, L(big)
+
+ or. r0, n, n
+ beqlr cr0
+
+C Handle small cases with plain operations
+ mtctr n
+L(topS):
+LIMB32(`lwz r0, 0(up) ')
+LIMB64(`ld r0, 0(up) ')
+ addi up, up, GMP_LIMB_BYTES
+LIMB32(`stw r0, 0(rp) ')
+LIMB64(`std r0, 0(rp) ')
+ addi rp, rp, GMP_LIMB_BYTES
+ bdnz L(topS)
+ blr
+
+C Handle large cases with VMX operations
+L(big):
+ mfspr r12, 256
+ oris r0, r12, 0xf800 C Set VRSAVE bits 0-4
+ mtspr 256, r0
+
+LIMB32(`rlwinm. r7, rp, 30,30,31') C (rp >> 2) mod 4
+LIMB64(`rlwinm. r7, rp, 29,31,31') C (rp >> 3) mod 2
+ beq L(rp_aligned)
+
+ subfic r7, r7, LIMBS_PER_VR
+ subf n, r7, n
+L(top0):
+LIMB32(`lwz r0, 0(up) ')
+LIMB64(`ld r0, 0(up) ')
+ addi up, up, GMP_LIMB_BYTES
+LIMB32(`addic. r7, r7, -1 ')
+LIMB32(`stw r0, 0(rp) ')
+LIMB64(`std r0, 0(rp) ')
+ addi rp, rp, GMP_LIMB_BYTES
+LIMB32(`bne L(top0) ')
+
+L(rp_aligned):
+
+LIMB32(`rlwinm. r0, up, 30,30,31') C (up >> 2) mod 4
+LIMB64(`rlwinm. r0, up, 29,31,31') C (up >> 3) mod 2
+
+LIMB64(`srdi r7, n, 2 ') C loop count corresponding to n
+LIMB32(`srwi r7, n, 3 ') C loop count corresponding to n
+ mtctr r7 C copy n to count register
+
+ li r10, 16
+
+ beq L(up_aligned)
+
+ lvsl us, 0, up
+
+LIMB32(`andi. r0, n, 0x4 ')
+LIMB64(`andi. r0, n, 0x2 ')
+ beq L(1)
+ lvx v0, 0, up
+ lvx v2, r10, up
+ vperm v3, v0, v2, us
+ stvx v3, 0, rp
+ addi up, up, 32
+ addi rp, rp, 16
+ b L(lpu)
+L(1): lvx v2, 0, up
+ addi up, up, 16
+ b L(lpu)
+
+ ALIGN(32)
+L(lpu): lvx v0, 0, up
+ vperm v3, v2, v0, us
+ stvx v3, 0, rp
+ lvx v2, r10, up
+ addi up, up, 32
+ vperm v3, v0, v2, us
+ stvx v3, r10, rp
+ addi rp, rp, 32
+ bdnz L(lpu)
+
+ addi up, up, -16
+ b L(tail)
+
+L(up_aligned):
+
+LIMB32(`andi. r0, n, 0x4 ')
+LIMB64(`andi. r0, n, 0x2 ')
+ beq L(lpa)
+ lvx v0, 0, up
+ stvx v0, 0, rp
+ addi up, up, 16
+ addi rp, rp, 16
+ b L(lpa)
+
+ ALIGN(32)
+L(lpa): lvx v0, 0, up
+ lvx v1, r10, up
+ addi up, up, 32
+ nop
+ stvx v0, 0, rp
+ stvx v1, r10, rp
+ addi rp, rp, 32
+ bdnz L(lpa)
+
+L(tail):
+LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4
+LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2
+ beq L(ret)
+LIMB32(`li r10, 0 ')
+L(top2):
+LIMB32(`lwzx r0, r10, up ')
+LIMB64(`ld r0, 0(up) ')
+LIMB32(`addic. r7, r7, -1 ')
+LIMB32(`stwx r0, r10, rp ')
+LIMB64(`std r0, 0(rp) ')
+LIMB32(`addi r10, r10, GMP_LIMB_BYTES')
+LIMB32(`bne L(top2) ')
+
+L(ret): mtspr 256, r12
+ blr
+EPILOGUE()
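
Correspondingly, a rough C sketch of mpn_copyi (illustrative only; the name
ref_copyi is an assumption): the same copy, but walking upward from the least
significant limb, which is the direction that is safe when rp <= up.

    #include <stddef.h>

    typedef unsigned long mp_limb_t;  /* assumption: one limb per machine word */

    static void
    ref_copyi (mp_limb_t *rp, const mp_limb_t *up, size_t n)
    {
      size_t i;
      for (i = 0; i < n; i++)   /* least significant limb first */
        rp[i] = up[i];
    }
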
diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm b/gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm
new file mode 100644
index 0000000..d656d3b
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/vmx/logops_n.asm
@@ -0,0 +1,310 @@
+dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_and_n, mpn_andn_n, mpn_nand_n,
+dnl mpn_ior_n, mpn_iorn_n, mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise
+dnl logical operations.
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                   and,ior,andn,nior,xor    iorn,xnor      nand
+C                        cycles/limb        cycles/limb  cycles/limb
+C 7400,7410 (G4):           1.39                 ?            ?
+C 744x,745x (G4+):          1.14                1.39         1.39
+C 970:                      1.7                 2.0          2.0
+
+C STATUS
+C * Works for all sizes and alignments for 32-bit limbs.
+C * Works for n >= 4 for 64-bit limbs; untested for smaller operands.
+C * Current performance makes this pointless for 970
+
+C TODO
+C * Might want to make variants when just one of the source operands needs
+C vperm, and when neither needs it. The latter runs 50% faster on 7400.
+C * Idea: If the source operands are equally aligned, we could do the logops
+C first, then vperm before storing! That means we never need more than one
+C vperm, ever!
+C * Perhaps align `rp' after initial alignment loop?
+C * Instead of having scalar code in the beginning and end, consider using
+C read-modify-write vector code.
+C * Software pipeline? Hopefully not too important, this is hairy enough
+C already.
+C * At least be more clever about operand loading, i.e., load v operands before
+C u operands, since v operands are sometimes negated.
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_VR', eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
+
+define(`vnegb', `') C default neg-before to null
+define(`vnega', `') C default neg-before to null
+
+ifdef(`OPERATION_and_n',
+` define(`func', `mpn_and_n')
+ define(`logopS',`and $1,$2,$3')
+ define(`logop', `vand $1,$2,$3')')
+ifdef(`OPERATION_andn_n',
+` define(`func', `mpn_andn_n')
+ define(`logopS',`andc $1,$2,$3')
+ define(`logop', `vandc $1,$2,$3')')
+ifdef(`OPERATION_nand_n',
+` define(`func', `mpn_nand_n')
+ define(`logopS',`nand $1,$2,$3')
+ define(`logop', `vand $1,$2,$3')
+ define(`vnega', `vnor $1,$2,$2')')
+ifdef(`OPERATION_ior_n',
+` define(`func', `mpn_ior_n')
+ define(`logopS',`or $1,$2,$3')
+ define(`logop', `vor $1,$2,$3')')
+ifdef(`OPERATION_iorn_n',
+` define(`func', `mpn_iorn_n')
+ define(`logopS',`orc $1,$2,$3')
+ define(`vnegb', `vnor $1,$2,$2')
+ define(`logop', `vor $1,$2,$3')')
+ifdef(`OPERATION_nior_n',
+` define(`func', `mpn_nior_n')
+ define(`logopS',`nor $1,$2,$3')
+ define(`logop', `vnor $1,$2,$3')')
+ifdef(`OPERATION_xor_n',
+` define(`func', `mpn_xor_n')
+ define(`logopS',`xor $1,$2,$3')
+ define(`logop', `vxor $1,$2,$3')')
+ifdef(`OPERATION_xnor_n',
+` define(`func',`mpn_xnor_n')
+ define(`logopS',`eqv $1,$2,$3')
+ define(`vnegb', `vnor $1,$2,$2')
+ define(`logop', `vxor $1,$2,$3')')
+
+ifelse(GMP_LIMB_BITS,`32',`
+ define(`LIMB32',` $1')
+ define(`LIMB64',`')
+',`
+ define(`LIMB32',`')
+ define(`LIMB64',` $1')
+')
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`vp', `r5')
+define(`n', `r6')
+
+define(`us', `v8')
+define(`vs', `v9')
+
+MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
+
+ASM_START()
+PROLOGUE(func)
+
+LIMB32(`cmpwi cr0, n, 8 ')
+LIMB64(`cmpdi cr0, n, 4 ')
+ bge L(big)
+
+ mtctr n
+
+LIMB32(`lwz r8, 0(up) ')
+LIMB32(`lwz r9, 0(vp) ')
+LIMB32(`logopS( r0, r8, r9) ')
+LIMB32(`stw r0, 0(rp) ')
+LIMB32(`bdz L(endS) ')
+
+L(topS):
+LIMB32(`lwzu r8, 4(up) ')
+LIMB64(`ld r8, 0(up) ')
+LIMB64(`addi up, up, GMP_LIMB_BYTES ')
+LIMB32(`lwzu r9, 4(vp) ')
+LIMB64(`ld r9, 0(vp) ')
+LIMB64(`addi vp, vp, GMP_LIMB_BYTES ')
+ logopS( r0, r8, r9)
+LIMB32(`stwu r0, 4(rp) ')
+LIMB64(`std r0, 0(rp) ')
+LIMB64(`addi rp, rp, GMP_LIMB_BYTES ')
+ bdnz L(topS)
+L(endS):
+ blr
+
+L(big): mfspr r12, 256
+ oris r0, r12, 0xfffc C Set VRSAVE bits 0-13 FIXME
+ mtspr 256, r0
+
+C First loop until the destination is 16-byte aligned. This will execute 0 or 1
+C times for 64-bit machines, and 0 to 3 times for 32-bit machines.
+
+LIMB32(`rlwinm. r0, rp, 30,30,31') C (rp >> 2) mod 4
+LIMB64(`rlwinm. r0, rp, 29,31,31') C (rp >> 3) mod 2
+ beq L(aligned)
+
+ subfic r7, r0, LIMBS_PER_VR
+LIMB32(`li r10, 0 ')
+ subf n, r7, n
+L(top0):
+LIMB32(`lwz r8, 0(up) ')
+LIMB64(`ld r8, 0(up) ')
+ addi up, up, GMP_LIMB_BYTES
+LIMB32(`lwz r9, 0(vp) ')
+LIMB64(`ld r9, 0(vp) ')
+ addi vp, vp, GMP_LIMB_BYTES
+LIMB32(`addic. r7, r7, -1 ')
+ logopS( r0, r8, r9)
+LIMB32(`stwx r0, r10, rp ')
+LIMB64(`std r0, 0(rp) ')
+LIMB32(`addi r10, r10, GMP_LIMB_BYTES')
+LIMB32(`bne L(top0) ')
+
+ addi rp, rp, 16 C update rp, but preserve its alignment
+
+L(aligned):
+LIMB64(`srdi r7, n, 1 ') C loop count corresponding to n
+LIMB32(`srwi r7, n, 2 ') C loop count corresponding to n
+ mtctr r7 C copy n to count register
+
+ li r10, 16
+ lvsl us, 0, up
+ lvsl vs, 0, vp
+
+ lvx v2, 0, up
+ lvx v3, 0, vp
+ bdnz L(gt1)
+ lvx v0, r10, up
+ lvx v1, r10, vp
+ vperm v4, v2, v0, us
+ vperm v5, v3, v1, vs
+ vnegb( v5, v5)
+ logop( v6, v4, v5)
+ vnega( v6, v6)
+ stvx v6, 0, rp
+ addi up, up, 16
+ addi vp, vp, 16
+ addi rp, rp, 4
+ b L(tail)
+
+L(gt1): addi up, up, 16
+ addi vp, vp, 16
+
+L(top): lvx v0, 0, up
+ lvx v1, 0, vp
+ vperm v4, v2, v0, us
+ vperm v5, v3, v1, vs
+ vnegb( v5, v5)
+ logop( v6, v4, v5)
+ vnega( v6, v6)
+ stvx v6, 0, rp
+ bdz L(end)
+ lvx v2, r10, up
+ lvx v3, r10, vp
+ vperm v4, v0, v2, us
+ vperm v5, v1, v3, vs
+ vnegb( v5, v5)
+ logop( v6, v4, v5)
+ vnega( v6, v6)
+ stvx v6, r10, rp
+ addi up, up, 32
+ addi vp, vp, 32
+ addi rp, rp, 32
+ bdnz L(top)
+
+ andi. r0, up, 15
+ vxor v0, v0, v0
+ beq 1f
+ lvx v0, 0, up
+1: andi. r0, vp, 15
+ vxor v1, v1, v1
+ beq 1f
+ lvx v1, 0, vp
+1: vperm v4, v2, v0, us
+ vperm v5, v3, v1, vs
+ vnegb( v5, v5)
+ logop( v6, v4, v5)
+ vnega( v6, v6)
+ stvx v6, 0, rp
+ addi rp, rp, 4
+ b L(tail)
+
+L(end): andi. r0, up, 15
+ vxor v2, v2, v2
+ beq 1f
+ lvx v2, r10, up
+1: andi. r0, vp, 15
+ vxor v3, v3, v3
+ beq 1f
+ lvx v3, r10, vp
+1: vperm v4, v0, v2, us
+ vperm v5, v1, v3, vs
+ vnegb( v5, v5)
+ logop( v6, v4, v5)
+ vnega( v6, v6)
+ stvx v6, r10, rp
+
+ addi up, up, 16
+ addi vp, vp, 16
+ addi rp, rp, 20
+
+L(tail):
+LIMB32(`rlwinm. r7, n, 0,30,31 ') C r7 = n mod 4
+LIMB64(`rlwinm. r7, n, 0,31,31 ') C r7 = n mod 2
+ beq L(ret)
+ addi rp, rp, 15
+LIMB32(`rlwinm rp, rp, 0,0,27 ')
+LIMB64(`rldicr rp, rp, 0,59 ')
+ li r10, 0
+L(top2):
+LIMB32(`lwzx r8, r10, up ')
+LIMB64(`ldx r8, r10, up ')
+LIMB32(`lwzx r9, r10, vp ')
+LIMB64(`ldx r9, r10, vp ')
+LIMB32(`addic. r7, r7, -1 ')
+ logopS( r0, r8, r9)
+LIMB32(`stwx r0, r10, rp ')
+LIMB64(`std r0, 0(rp) ')
+LIMB32(`addi r10, r10, GMP_LIMB_BYTES')
+LIMB32(`bne L(top2) ')
+
+L(ret): mtspr 256, r12
+ blr
+EPILOGUE()
+
+C This works for 64-bit PowerPC, since a limb ptr can only be aligned
+C in 2 relevant ways, which means we can always find a pair of aligned
+C pointers of rp, up, and vp.
+C process words until rp is 16-byte aligned
+C if (((up | vp) & 15) == 0)
+C process with VMX without any vperm
+C else if ((up & 15) != 0 && (vp & 15) != 0)
+C process with VMX using vperm on store data
+C else if ((up & 15) != 0)
+C process with VMX using vperm on up data
+C else
+C process with VMX using vperm on vp data
+C
+C rlwinm r0, up, 0,28,31
+C rlwinm r0, vp, 0,28,31
+C cmpwi cr7, r0, 0
+C cror cr6, cr0, cr7
+C crand cr0, cr0, cr7
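
All eight entry points generated from this file share one shape: rp[i] is a
bitwise function of up[i] and vp[i], with the vnegb/vnega macros supplying the
complements.  A compact C sketch of those limb-wise semantics (illustrative
only; the dispatch on an op code is an assumption for brevity, GMP builds a
separate function per operation):

    #include <stddef.h>

    typedef unsigned long mp_limb_t;  /* assumption: one limb per machine word */

    static void
    ref_logop_n (mp_limb_t *rp, const mp_limb_t *up, const mp_limb_t *vp,
                 size_t n, int op)
    {
      size_t i;
      for (i = 0; i < n; i++)
        {
          mp_limb_t u = up[i], v = vp[i], r;
          switch (op)
            {
            case 0:  r = u & v;      break;  /* mpn_and_n   */
            case 1:  r = u & ~v;     break;  /* mpn_andn_n  */
            case 2:  r = ~(u & v);   break;  /* mpn_nand_n  */
            case 3:  r = u | v;      break;  /* mpn_ior_n   */
            case 4:  r = u | ~v;     break;  /* mpn_iorn_n  */
            case 5:  r = ~(u | v);   break;  /* mpn_nior_n  */
            case 6:  r = u ^ v;      break;  /* mpn_xor_n   */
            default: r = ~(u ^ v);   break;  /* mpn_xnor_n  */
            }
          rp[i] = r;
        }
    }

In the assembly, logop is the vector instruction for the core operation,
vnegb complements the vp operand beforehand (iorn, xnor), and vnega
complements the result afterwards (nand).
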
diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm b/gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm
new file mode 100644
index 0000000..2bb11cd
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/vmx/mod_34lsub1.asm
@@ -0,0 +1,388 @@
+dnl PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.
+
+dnl Copyright 2002, 2003, 2005-2007, 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C                    cycles/limb
+C 603e:                   -
+C 604e:                   -
+C 75x (G3):               -
+C 7400,7410 (G4):         1        simple load-use scheduling results in 0.75
+C 744x,745x (G4+):        0.75
+C ppc970:                 0.75
+C power4:                 -
+C power5:                 -
+
+C TODO
+C * Either start using the low-end masking constants, or remove them.
+C * Merge multiple feed-in cases into a parameterized code block.
+C * Reduce register usage. It should be possible to almost halve it.
+
+define(`up', `r3')
+define(`n', `r4')
+
+define(`a0', `v3')
+define(`a1', `v4')
+define(`a2', `v5')
+define(`c0', `v6')
+define(`c1', `v7')
+define(`c2', `v8')
+define(`z', `v9')
+define(`x0', `v10')
+define(`x1', `v11')
+define(`x2', `v12')
+define(`x3', `v13')
+define(`pv', `v14')
+define(`y0', `v0')
+define(`y1', `v1')
+define(`y2', `v2')
+define(`y3', `v15')
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+ cmpwi cr0, n, 20 C tuned cutoff point
+ bge L(large)
+
+ li r9, 0 C result accumulator
+ mulli r10, n, 0xb C 0xb = ceil(32/3)
+ srwi. r10, r10, 5 C r10 = floor(n/3), n < 32
+ beq L(small_tail)
+ mtctr r10
+ lwz r6, 0(up)
+ lwz r7, 4(up)
+ lwzu r8, 8(up)
+ subf n, r10, n
+ subf n, r10, n
+ subf n, r10, n
+ bdz L(small_end)
+
+ ALIGN(16)
+L(los): rlwinm r0, r6, 0,8,31
+ add r9, r9, r0 C add 24b from u0
+ srwi r0, r6, 24
+ lwz r6, 4(up)
+ rlwimi r0, r7, 8, 0x00ffff00 C --111100
+ add r9, r9, r0 C add 8b from u0 and 16b from u1
+ srwi r0, r7, 16
+ lwz r7, 8(up)
+ rlwimi r0, r8, 16, 0x00ff0000 C --221111
+ add r9, r9, r0 C add 16b from u1 and 8b from u2
+ srwi r0, r8, 8 C --222222
+ lwzu r8, 12(up)
+ add r9, r9, r0 C add 24b from u2
+ bdnz L(los)
+L(small_end):
+ rlwinm r0, r6, 0,8,31
+ add r9, r9, r0 C add 24b from u0
+ srwi r0, r6, 24
+ rlwimi r0, r7, 8, 0x00ffff00 C --111100
+ add r9, r9, r0 C add 8b from u0 and 16b from u1
+ srwi r0, r7, 16
+ rlwimi r0, r8, 16, 0x00ff0000 C --221111
+ add r9, r9, r0 C add 16b from u1 and 8b from u2
+ srwi r0, r8, 8 C --222222
+ add r9, r9, r0 C add 24b from u2
+
+ addi up, up, 4
+ rlwinm r0, r9, 0,8,31
+ srwi r9, r9, 24
+ add r9, r9, r0
+
+L(small_tail):
+ cmpi cr0, n, 1
+ blt L(ret)
+
+ lwz r6, 0(up)
+ rlwinm r0, r6, 0,8,31
+ srwi r6, r6, 24
+ add r9, r9, r0
+ add r9, r9, r6
+
+ beq L(ret)
+
+ lwz r6, 4(up)
+ rlwinm r0, r6, 8,8,23
+ srwi r6, r6, 16
+ add r9, r9, r0
+ add r9, r9, r6
+
+L(ret): mr r3, r9
+ blr
+
+
+L(large):
+ stwu r1, -32(r1)
+ mfspr r10, 256
+ oris r0, r10, 0xffff C Set VRSAVE bits 0-15
+ mtspr 256, r0
+
+ andi. r7, up, 15
+ vxor a0, v0, v0
+ lis r9, 0xaaaa
+ vxor a1, v0, v0
+ ori r9, r9, 0xaaab
+ vxor a2, v0, v0
+ li r5, 16
+ vxor c0, v0, v0
+ li r6, 32
+ vxor c1, v0, v0
+ LEAL( r11, cnsts) C CAUTION clobbers r0 for elf, darwin
+ vxor c2, v0, v0
+ vxor z, v0, v0
+
+ beq L(aligned16)
+
+ cmpwi cr7, r7, 8
+ bge cr7, L(na4)
+
+ lvx a2, 0, up
+ addi up, up, 16
+ vsldoi a2, a2, z, 4
+ vsldoi a2, z, a2, 12
+
+ addi n, n, 9
+ mulhwu r0, n, r9
+ srwi r0, r0, 3 C r0 = floor(n/12)
+ mtctr r0
+
+ mulli r8, r0, 12
+ subf n, r8, n
+ b L(2)
+
+L(na4): bne cr7, L(na8)
+
+ lvx a1, 0, up
+ addi up, up, -16
+ vsldoi a1, a1, z, 8
+ vsldoi a1, z, a1, 8
+
+ addi n, n, 6
+ mulhwu r0, n, r9
+ srwi r0, r0, 3 C r0 = floor(n/12)
+ mtctr r0
+
+ mulli r8, r0, 12
+ subf n, r8, n
+ b L(1)
+
+L(na8):
+ lvx a0, 0, up
+ vsldoi a0, a0, z, 12
+ vsldoi a0, z, a0, 4
+
+ addi n, n, 3
+ mulhwu r0, n, r9
+ srwi r0, r0, 3 C r0 = floor(n/12)
+ mtctr r0
+
+ mulli r8, r0, 12
+ subf n, r8, n
+ b L(0)
+
+L(aligned16):
+ mulhwu r0, n, r9
+ srwi r0, r0, 3 C r0 = floor(n/12)
+ mtctr r0
+
+ mulli r8, r0, 12
+ subf n, r8, n
+
+ lvx a0, 0, up
+L(0): lvx a1, r5, up
+L(1): lvx a2, r6, up
+ addi up, up, 48
+L(2): bdz L(end)
+ li r12, 256
+ li r9, 288
+ ALIGN(32)
+L(top):
+ lvx v0, 0, up
+ vaddcuw v10, a0, v0
+ vadduwm a0, a0, v0
+ vadduwm c0, c0, v10
+
+ lvx v1, r5, up
+ vaddcuw v10, a1, v1
+ vadduwm a1, a1, v1
+ vadduwm c1, c1, v10
+
+ lvx v2, r6, up
+ dcbt up, r12
+ dcbt up, r9
+ addi up, up, 48
+ vaddcuw v10, a2, v2
+ vadduwm a2, a2, v2
+ vadduwm c2, c2, v10
+ bdnz L(top)
+
+L(end):
+C n = 0...11
+ cmpwi cr0, n, 0
+ beq L(sum)
+ cmpwi cr0, n, 4
+ ble L(tail.1..4)
+ cmpwi cr0, n, 8
+ ble L(tail.5..8)
+
+L(tail.9..11):
+ lvx v0, 0, up
+ vaddcuw v10, a0, v0
+ vadduwm a0, a0, v0
+ vadduwm c0, c0, v10
+
+ lvx v1, r5, up
+ vaddcuw v10, a1, v1
+ vadduwm a1, a1, v1
+ vadduwm c1, c1, v10
+
+ lvx v2, r6, up
+
+ addi r8, r11, 96
+ rlwinm r3, n, 4,26,27
+ lvx v11, r3, r8
+ vand v2, v2, v11
+
+ vaddcuw v10, a2, v2
+ vadduwm a2, a2, v2
+ vadduwm c2, c2, v10
+ b L(sum)
+
+L(tail.5..8):
+ lvx v0, 0, up
+ vaddcuw v10, a0, v0
+ vadduwm a0, a0, v0
+ vadduwm c0, c0, v10
+
+ lvx v1, r5, up
+
+ addi r8, r11, 96
+ rlwinm r3, n, 4,26,27
+ lvx v11, r3, r8
+ vand v1, v1, v11
+
+ vaddcuw v10, a1, v1
+ vadduwm a1, a1, v1
+ vadduwm c1, c1, v10
+ b L(sum)
+
+L(tail.1..4):
+ lvx v0, 0, up
+
+ addi r8, r11, 96
+ rlwinm r3, n, 4,26,27
+ lvx v11, r3, r8
+ vand v0, v0, v11
+
+ vaddcuw v10, a0, v0
+ vadduwm a0, a0, v0
+ vadduwm c0, c0, v10
+
+L(sum): lvx pv, 0, r11
+ vperm x0, a0, z, pv C extract 4 24-bit fields from a0
+ vperm y0, c2, z, pv C extract 4 24-bit fields from c2
+ lvx pv, r5, r11
+ vperm x1, a1, z, pv C extract 4 24-bit fields from a1
+ vperm y1, c0, z, pv C extract 4 24-bit fields from c0
+ lvx pv, r6, r11
+ vperm x2, a2, z, pv C extract 4 24-bit fields from a2
+ vperm y2, c1, z, pv C extract 4 24-bit fields from c1
+ li r10, 48
+ lvx pv, r10, r11
+ vperm x3, a0, z, pv C extract remaining/partial a0 fields
+ vperm y3, c2, z, pv C extract remaining/partial c2 fields
+ li r10, 64
+ lvx pv, r10, r11
+ vperm x3, a1, x3, pv C insert remaining/partial a1 fields
+ vperm y3, c0, y3, pv C insert remaining/partial c0 fields
+ li r10, 80
+ lvx pv, r10, r11
+ vperm x3, a2, x3, pv C insert remaining/partial a2 fields
+ vperm y3, c1, y3, pv C insert remaining/partial c1 fields
+
+C We now have 4 128-bit accumulators to sum
+ vadduwm x0, x0, x1
+ vadduwm x2, x2, x3
+ vadduwm x0, x0, x2
+
+ vadduwm y0, y0, y1
+ vadduwm y2, y2, y3
+ vadduwm y0, y0, y2
+
+ vadduwm x0, x0, y0
+
+C Reduce 32-bit fields
+ vsumsws x0, x0, z
+
+ li r7, 16
+ stvx x0, r7, r1
+ lwz r3, 28(r1)
+
+ mtspr 256, r10
+ addi r1, r1, 32
+ blr
+EPILOGUE()
+
+C load | v0 | v1 | v2 |
+C acc | a0 | a1 | a2 |
+C carry | c0 | c1 | c2 |
+C | 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 128
+C |---|---|---|---|---|---|---|---|---|---|---|---| 32
+C | | | | | | | | | | | | | | | | | 24
+C | | | | | | | | | 48
+
+C $---------------$---------------$---------------$---------------$
+C | . . . . . . . . . . . . . . . |
+C |_______________________________________________________________|
+C | | | | | | |
+C <-hi16-> <--- 24 --> <--- 24 --> <--- 24 --> <--- 24 --> <-lo16->
+
+
+DEF_OBJECT(cnsts,16)
+C Permutation vectors in the order they are used above
+C # 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
+ .byte 0x10,0x01,0x02,0x03, 0x10,0x06,0x07,0x00, 0x10,0x0b,0x04,0x05, 0x10,0x08,0x09,0x0a C a0
+ .byte 0x10,0x07,0x00,0x01, 0x10,0x04,0x05,0x06, 0x10,0x09,0x0a,0x0b, 0x10,0x0e,0x0f,0x08 C a1
+ .byte 0x10,0x00,0x01,0x02, 0x10,0x05,0x06,0x07, 0x10,0x0a,0x0b,0x04, 0x10,0x0f,0x08,0x09 C a2
+ .byte 0x10,0x0d,0x0e,0x0f, 0x10,0x10,0x10,0x0c, 0x10,0x10,0x10,0x10, 0x10,0x10,0x10,0x10 C part a0
+ .byte 0x10,0x11,0x12,0x13, 0x10,0x02,0x03,0x17, 0x10,0x10,0x0c,0x0d, 0x10,0x10,0x10,0x10 C part a1
+ .byte 0x10,0x11,0x12,0x13, 0x10,0x15,0x16,0x17, 0x10,0x03,0x1a,0x1b, 0x10,0x0c,0x0d,0x0e C part a2
+C Masks for high end of number
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+ .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+ .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+C Masks for low end of number
+C .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+C .byte 0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+C .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+C .byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+END_OBJECT(cnsts)
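
The function's contract is to return a value congruent to {up,n} modulo
2^24-1 (for 32-bit limbs), which the caller reduces further.  A scalar C
sketch of that contract, exploiting 2^32 == 2^8 (mod 2^24-1) so that limb i
carries a weight of 2^(8*(i mod 3)) (illustrative only; the folding strategy
and the name ref_mod_34lsub1 are assumptions, not GMP's code):

    #include <stddef.h>
    #include <stdint.h>

    static uint32_t
    ref_mod_34lsub1 (const uint32_t *up, size_t n)   /* assumes 32-bit limbs */
    {
      uint64_t acc = 0;
      size_t i;
      for (i = 0; i < n; i++)
        {
          acc += (uint64_t) up[i] << (8 * (i % 3));
          if (acc >> 48)                 /* fold early so acc never overflows */
            acc = (acc & 0xffffff) + (acc >> 24);
        }
      while (acc >> 24)                  /* final fold into one 24-bit field */
        acc = (acc & 0xffffff) + (acc >> 24);
      return (uint32_t) acc;
    }

Each fold preserves the residue because 2^24 == 1 (mod 2^24-1).  The VMX code
reaches the same congruence by accumulating three vectors of 32-bit sums
(a0/a1/a2) plus their carries (c0/c1/c2) and then using the cnsts permutation
vectors above to split everything into 24-bit fields before the final vsumsws
reduction.
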
diff --git a/gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm b/gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm
new file mode 100644
index 0000000..943c92d
--- /dev/null
+++ b/gmp-6.3.0/mpn/powerpc32/vmx/popcount.asm
@@ -0,0 +1,34 @@
+dnl PowerPC-32/VMX mpn_popcount.
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`powerpc64/vmx/popcount.asm')
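
A rough C equivalent of the operation being delegated here (illustrative
sketch only; the name ref_popcount is an assumption): count the set bits
across all n limbs and return the total.

    #include <stddef.h>

    typedef unsigned long mp_limb_t;  /* assumption: one limb per machine word */

    static mp_limb_t
    ref_popcount (const mp_limb_t *up, size_t n)
    {
      mp_limb_t cnt = 0;
      size_t i;
      for (i = 0; i < n; i++)
        {
          mp_limb_t x = up[i];
          while (x != 0)
            {
              x &= x - 1;       /* clear the lowest set bit */
              cnt++;
            }
        }
      return cnt;
    }

The powerpc32 file itself contains no loop: MULFUNC_PROLOGUE declares the
entry point, and include_mpn pulls in the shared powerpc64/vmx implementation,
which does the counting with VMX vector operations.
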