aboutsummaryrefslogtreecommitdiff
path: root/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100
diff options
context:
space:
mode:
Diffstat (limited to 'gmp-6.3.0/mpn/pa32/hppa1_1/pa7100')
-rw-r--r--gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/add_n.asm83
-rw-r--r--gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/addmul_1.asm201
-rw-r--r--gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/lshift.asm95
-rw-r--r--gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/rshift.asm92
-rw-r--r--gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/sub_n.asm84
-rw-r--r--gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/submul_1.asm207
6 files changed, 762 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/add_n.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/add_n.asm
new file mode 100644
index 0000000..b96d403
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/add_n.asm
@@ -0,0 +1,83 @@
+dnl HP-PA mpn_add_n -- Add two limb vectors of the same length > 0 and store
+dnl sum in a third limb vector. Optimized for the PA7100, where is runs at
+dnl 4.25 cycles/limb.
+
+dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r26
+C s1_ptr r25
+C s2_ptr r24
+C size r23
+
+ASM_START()
+PROLOGUE(mpn_add_n)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,<= -5,%r23,L(rest)
+ add %r20,%r19,%r28 C add first limbs ignoring cy
+
+LDEF(loop)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addc %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -4,%r23,L(loop)
+ addc %r20,%r19,%r28
+
+LDEF(rest)
+ addib,= 4,%r23,L(end)
+ nop
+
+LDEF(eloop)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -1,%r23,L(eloop)
+ addc %r20,%r19,%r28
+
+LDEF(end)
+ stws %r28,0(0,%r26)
+ bv 0(%r2)
+ addc %r0,%r0,%r28
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/addmul_1.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/addmul_1.asm
new file mode 100644
index 0000000..fb16100
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/addmul_1.asm
@@ -0,0 +1,201 @@
+dnl HP-PA 7100/7200 mpn_addmul_1 -- Multiply a limb vector with a limb and
+dnl add the result to a second limb vector.
+
+dnl Copyright 1995, 2000-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`res_ptr',`%r26')
+define(`s1_ptr',`%r25')
+define(`size_param',`%r24')
+define(`s2_limb',`%r23')
+
+define(`cylimb',`%r28')
+define(`s0',`%r19')
+define(`s1',`%r20')
+define(`s2',`%r3')
+define(`s3',`%r4')
+define(`lo0',`%r21')
+define(`lo1',`%r5')
+define(`lo2',`%r6')
+define(`lo3',`%r7')
+define(`hi0',`%r22')
+define(`hi1',`%r23') C safe to reuse
+define(`hi2',`%r29')
+define(`hi3',`%r1')
+
+ASM_START()
+PROLOGUE(mpn_addmul_1)
+C .callinfo frame=128,no_calls
+
+ ldo 128(%r30),%r30
+ stws s2_limb,-16(%r30)
+ add %r0,%r0,cylimb C clear cy and cylimb
+ addib,< -4,size_param,L(few_limbs)
+ fldws -16(%r30),%fr31R
+
+ ldo -112(%r30),%r31
+ stw %r3,-96(%r30)
+ stw %r4,-92(%r30)
+ stw %r5,-88(%r30)
+ stw %r6,-84(%r30)
+ stw %r7,-80(%r30)
+
+ bb,>=,n s1_ptr,29,L(0)
+
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r31)
+ ldws -16(%r31),cylimb
+ ldws -12(%r31),lo0
+ add s0,lo0,s0
+ addib,< -1,size_param,L(few_limbs)
+ stws,ma s0,4(res_ptr)
+
+C start software pipeline ----------------------------------------------------
+LDEF(0)
+ fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ xmpyu %fr4L,%fr31R,%fr5
+ xmpyu %fr4R,%fr31R,%fr6
+ xmpyu %fr8L,%fr31R,%fr9
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ fstds %fr6,-8(%r31)
+ fstds %fr9,0(%r31)
+ fstds %fr10,8(%r31)
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ addc lo1,hi0,lo1
+ addc lo2,hi1,lo2
+ addc lo3,hi2,lo3
+
+ addib,< -4,size_param,L(end)
+ addc %r0,hi3,cylimb C propagate carry into cylimb
+C main loop ------------------------------------------------------------------
+LDEF(loop)
+ fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ ldws 0(res_ptr),s0
+ xmpyu %fr4L,%fr31R,%fr5
+ ldws 4(res_ptr),s1
+ xmpyu %fr4R,%fr31R,%fr6
+ ldws 8(res_ptr),s2
+ xmpyu %fr8L,%fr31R,%fr9
+ ldws 12(res_ptr),s3
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ add s0,lo0,s0
+ fstds %fr6,-8(%r31)
+ addc s1,lo1,s1
+ fstds %fr9,0(%r31)
+ addc s2,lo2,s2
+ fstds %fr10,8(%r31)
+ addc s3,lo3,s3
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ stws,ma s0,4(res_ptr)
+ addc lo1,hi0,lo1
+ stws,ma s1,4(res_ptr)
+ addc lo2,hi1,lo2
+ stws,ma s2,4(res_ptr)
+ addc lo3,hi2,lo3
+ stws,ma s3,4(res_ptr)
+
+ addib,>= -4,size_param,L(loop)
+ addc %r0,hi3,cylimb C propagate carry into cylimb
+C finish software pipeline ---------------------------------------------------
+LDEF(end)
+ ldws 0(res_ptr),s0
+ ldws 4(res_ptr),s1
+ ldws 8(res_ptr),s2
+ ldws 12(res_ptr),s3
+
+ add s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ addc s1,lo1,s1
+ stws,ma s1,4(res_ptr)
+ addc s2,lo2,s2
+ stws,ma s2,4(res_ptr)
+ addc s3,lo3,s3
+ stws,ma s3,4(res_ptr)
+
+C restore callee-saves registers ---------------------------------------------
+ ldw -96(%r30),%r3
+ ldw -92(%r30),%r4
+ ldw -88(%r30),%r5
+ ldw -84(%r30),%r6
+ ldw -80(%r30),%r7
+
+LDEF(few_limbs)
+ addib,=,n 4,size_param,L(ret)
+
+LDEF(loop2)
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r30)
+ ldws -16(%r30),hi0
+ ldws -12(%r30),lo0
+ addc lo0,cylimb,lo0
+ addc %r0,hi0,cylimb
+ add s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ addib,<> -1,size_param,L(loop2)
+ nop
+
+LDEF(ret)
+ addc %r0,cylimb,cylimb
+ bv 0(%r2)
+ ldo -128(%r30),%r30
+EPILOGUE(mpn_addmul_1)
diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/lshift.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/lshift.asm
new file mode 100644
index 0000000..d65db2a
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/lshift.asm
@@ -0,0 +1,95 @@
+dnl HP-PA mpn_lshift -- Shift a number left.
+dnl Optimized for the PA7100, where is runs at 3.25 cycles/limb.
+
+dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r26
+C s_ptr r25
+C size r24
+C cnt r23
+
+ASM_START()
+PROLOGUE(mpn_lshift)
+ sh2add %r24,%r25,%r25
+ sh2add %r24,%r26,%r26
+ ldws,mb -4(0,%r25),%r22
+ subi 32,%r23,%r1
+ mtsar %r1
+ addib,= -1,%r24,L(0004)
+ vshd %r0,%r22,%r28 C compute carry out limb
+ ldws,mb -4(0,%r25),%r29
+ addib,<= -5,%r24,L(rest)
+ vshd %r22,%r29,%r20
+
+LDEF(loop)
+ ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,> -4,%r24,L(loop)
+ vshd %r22,%r29,%r20
+
+LDEF(rest)
+ addib,= 4,%r24,L(end1)
+ nop
+
+LDEF(eloop)
+ ldws,mb -4(0,%r25),%r22
+ stws,mb %r20,-4(0,%r26)
+ addib,<= -1,%r24,L(end2)
+ vshd %r29,%r22,%r20
+ ldws,mb -4(0,%r25),%r29
+ stws,mb %r20,-4(0,%r26)
+ addib,> -1,%r24,L(eloop)
+ vshd %r22,%r29,%r20
+
+LDEF(end1)
+ stws,mb %r20,-4(0,%r26)
+ vshd %r29,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+
+LDEF(end2)
+ stws,mb %r20,-4(0,%r26)
+
+LDEF(0004)
+ vshd %r22,%r0,%r20
+ bv 0(%r2)
+ stw %r20,-4(0,%r26)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/rshift.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/rshift.asm
new file mode 100644
index 0000000..f7896fc
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/rshift.asm
@@ -0,0 +1,92 @@
+dnl HP-PA mpn_rshift -- Shift a number right.
+dnl Optimized for the PA7100, where is runs at 3.25 cycles/limb.
+
+dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r26
+C s_ptr r25
+C size r24
+C cnt r23
+
+ASM_START()
+PROLOGUE(mpn_rshift)
+ ldws,ma 4(0,%r25),%r22
+ mtsar %r23
+ addib,= -1,%r24,L(0004)
+ vshd %r22,%r0,%r28 C compute carry out limb
+ ldws,ma 4(0,%r25),%r29
+ addib,<= -5,%r24,L(rest)
+ vshd %r29,%r22,%r20
+
+LDEF(loop)
+ ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ vshd %r29,%r22,%r20
+ ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,> -4,%r24,L(loop)
+ vshd %r29,%r22,%r20
+
+LDEF(rest)
+ addib,= 4,%r24,L(end1)
+ nop
+
+LDEF(eloop)
+ ldws,ma 4(0,%r25),%r22
+ stws,ma %r20,4(0,%r26)
+ addib,<= -1,%r24,L(end2)
+ vshd %r22,%r29,%r20
+ ldws,ma 4(0,%r25),%r29
+ stws,ma %r20,4(0,%r26)
+ addib,> -1,%r24,L(eloop)
+ vshd %r29,%r22,%r20
+
+LDEF(end1)
+ stws,ma %r20,4(0,%r26)
+ vshd %r0,%r29,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+
+LDEF(end2)
+ stws,ma %r20,4(0,%r26)
+
+LDEF(0004)
+ vshd %r0,%r22,%r20
+ bv 0(%r2)
+ stw %r20,0(0,%r26)
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/sub_n.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/sub_n.asm
new file mode 100644
index 0000000..df3f6e8
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/sub_n.asm
@@ -0,0 +1,84 @@
+dnl HP-PA mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+dnl store difference in a third limb vector. Optimized for the PA7100, where
+dnl is runs at 4.25 cycles/limb.
+
+dnl Copyright 1992, 1994, 2000-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C res_ptr r26
+C s1_ptr r25
+C s2_ptr r24
+C size r23
+
+ASM_START()
+PROLOGUE(mpn_sub_n)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+
+ addib,<= -5,%r23,L(rest)
+ sub %r20,%r19,%r28 C subtract first limbs ignoring cy
+
+LDEF(loop)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ subb %r20,%r19,%r28
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -4,%r23,L(loop)
+ subb %r20,%r19,%r28
+
+LDEF(rest)
+ addib,= 4,%r23,L(end)
+ nop
+
+LDEF(eloop)
+ ldws,ma 4(0,%r25),%r20
+ ldws,ma 4(0,%r24),%r19
+ stws,ma %r28,4(0,%r26)
+ addib,> -1,%r23,L(eloop)
+ subb %r20,%r19,%r28
+
+LDEF(end)
+ stws %r28,0(0,%r26)
+ addc %r0,%r0,%r28
+ bv 0(%r2)
+ subi 1,%r28,%r28
+EPILOGUE()
diff --git a/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/submul_1.asm b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/submul_1.asm
new file mode 100644
index 0000000..5ea08cb
--- /dev/null
+++ b/gmp-6.3.0/mpn/pa32/hppa1_1/pa7100/submul_1.asm
@@ -0,0 +1,207 @@
+dnl HP-PA 7100/7200 mpn_submul_1 -- Multiply a limb vector with a limb and
+dnl subtract the result from a second limb vector.
+
+dnl Copyright 1995, 2000-2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+define(`res_ptr',`%r26')
+define(`s1_ptr',`%r25')
+define(`size_param',`%r24')
+define(`s2_limb',`%r23')
+
+define(`cylimb',`%r28')
+define(`s0',`%r19')
+define(`s1',`%r20')
+define(`s2',`%r3')
+define(`s3',`%r4')
+define(`lo0',`%r21')
+define(`lo1',`%r5')
+define(`lo2',`%r6')
+define(`lo3',`%r7')
+define(`hi0',`%r22')
+define(`hi1',`%r23') C safe to reuse
+define(`hi2',`%r29')
+define(`hi3',`%r1')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+C .callinfo frame=128,no_calls
+
+ ldo 128(%r30),%r30
+ stws s2_limb,-16(%r30)
+ add %r0,%r0,cylimb C clear cy and cylimb
+ addib,< -4,size_param,L(few_limbs)
+ fldws -16(%r30),%fr31R
+
+ ldo -112(%r30),%r31
+ stw %r3,-96(%r30)
+ stw %r4,-92(%r30)
+ stw %r5,-88(%r30)
+ stw %r6,-84(%r30)
+ stw %r7,-80(%r30)
+
+ bb,>=,n s1_ptr,29,L(0)
+
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r31)
+ ldws -16(%r31),cylimb
+ ldws -12(%r31),lo0
+ sub s0,lo0,s0
+ add s0,lo0,%r0 C invert cy
+ addib,< -1,size_param,L(few_limbs)
+ stws,ma s0,4(res_ptr)
+
+C start software pipeline ----------------------------------------------------
+LDEF(0)
+ fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ xmpyu %fr4L,%fr31R,%fr5
+ xmpyu %fr4R,%fr31R,%fr6
+ xmpyu %fr8L,%fr31R,%fr9
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ fstds %fr6,-8(%r31)
+ fstds %fr9,0(%r31)
+ fstds %fr10,8(%r31)
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ addc lo1,hi0,lo1
+ addc lo2,hi1,lo2
+ addc lo3,hi2,lo3
+
+ addib,< -4,size_param,L(end)
+ addc %r0,hi3,cylimb C propagate carry into cylimb
+C main loop ------------------------------------------------------------------
+LDEF(loop)
+ fldds,ma 8(s1_ptr),%fr4
+ fldds,ma 8(s1_ptr),%fr8
+
+ ldws 0(res_ptr),s0
+ xmpyu %fr4L,%fr31R,%fr5
+ ldws 4(res_ptr),s1
+ xmpyu %fr4R,%fr31R,%fr6
+ ldws 8(res_ptr),s2
+ xmpyu %fr8L,%fr31R,%fr9
+ ldws 12(res_ptr),s3
+ xmpyu %fr8R,%fr31R,%fr10
+
+ fstds %fr5,-16(%r31)
+ sub s0,lo0,s0
+ fstds %fr6,-8(%r31)
+ subb s1,lo1,s1
+ fstds %fr9,0(%r31)
+ subb s2,lo2,s2
+ fstds %fr10,8(%r31)
+ subb s3,lo3,s3
+ subb %r0,%r0,lo0 C these two insns ...
+ add lo0,lo0,%r0 C ... just invert cy
+
+ ldws -16(%r31),hi0
+ ldws -12(%r31),lo0
+ ldws -8(%r31),hi1
+ ldws -4(%r31),lo1
+ ldws 0(%r31),hi2
+ ldws 4(%r31),lo2
+ ldws 8(%r31),hi3
+ ldws 12(%r31),lo3
+
+ addc lo0,cylimb,lo0
+ stws,ma s0,4(res_ptr)
+ addc lo1,hi0,lo1
+ stws,ma s1,4(res_ptr)
+ addc lo2,hi1,lo2
+ stws,ma s2,4(res_ptr)
+ addc lo3,hi2,lo3
+ stws,ma s3,4(res_ptr)
+
+ addib,>= -4,size_param,L(loop)
+ addc %r0,hi3,cylimb C propagate carry into cylimb
+C finish software pipeline ---------------------------------------------------
+LDEF(end)
+ ldws 0(res_ptr),s0
+ ldws 4(res_ptr),s1
+ ldws 8(res_ptr),s2
+ ldws 12(res_ptr),s3
+
+ sub s0,lo0,s0
+ stws,ma s0,4(res_ptr)
+ subb s1,lo1,s1
+ stws,ma s1,4(res_ptr)
+ subb s2,lo2,s2
+ stws,ma s2,4(res_ptr)
+ subb s3,lo3,s3
+ stws,ma s3,4(res_ptr)
+ subb %r0,%r0,lo0 C these two insns ...
+ add lo0,lo0,%r0 C ... invert cy
+
+C restore callee-saves registers ---------------------------------------------
+ ldw -96(%r30),%r3
+ ldw -92(%r30),%r4
+ ldw -88(%r30),%r5
+ ldw -84(%r30),%r6
+ ldw -80(%r30),%r7
+
+LDEF(few_limbs)
+ addib,=,n 4,size_param,L(ret)
+
+LDEF(loop2)
+ fldws,ma 4(s1_ptr),%fr4
+ ldws 0(res_ptr),s0
+ xmpyu %fr4,%fr31R,%fr5
+ fstds %fr5,-16(%r30)
+ ldws -16(%r30),hi0
+ ldws -12(%r30),lo0
+ addc lo0,cylimb,lo0
+ addc %r0,hi0,cylimb
+ sub s0,lo0,s0
+ add s0,lo0,%r0 C invert cy
+ stws,ma s0,4(res_ptr)
+ addib,<> -1,size_param,L(loop2)
+ nop
+
+LDEF(ret)
+ addc %r0,cylimb,cylimb
+ bv 0(%r2)
+ ldo -128(%r30),%r30
+EPILOGUE(mpn_submul_1)