From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/x86/k7/mod_1_4.asm | 260 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 260 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/x86/k7/mod_1_4.asm

diff --git a/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm b/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm
new file mode 100644
index 0000000..bb7597e
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/k7/mod_1_4.asm
@@ -0,0 +1,260 @@
+dnl  x86-32 mpn_mod_1s_4p, requiring cmov.
+
+dnl  Contributed to the GNU project by Torbjorn Granlund.
+
+dnl  Copyright 2009, 2010 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                            cycles/limb
+C P5                          ?
+C P6 model 0-8,10-12          ?
+C P6 model 9   (Banias)       ?
+C P6 model 13  (Dothan)       6
+C P4 model 0   (Willamette)   ?
+C P4 model 1   (?)            ?
+C P4 model 2   (Northwood)   15.5
+C P4 model 3   (Prescott)     ?
+C P4 model 4   (Nocona)       ?
+C AMD K6                      ?
+C AMD K7                      4.75
+C AMD K8                      ?
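+
+C The C prototype is assumed to be the one gmp-impl.h declares:
+C   mp_limb_t mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b,
+C                            const mp_limb_t cps[7]);
+C After the four pushes and the 28-byte frame set up below, the arguments
+C sit at 48(%esp) = ap, 52(%esp) = n, 56(%esp) = b, 60(%esp) = cps.  The
+C five folding constants cps[2]..cps[6] (B1modb..B5modb, computed by
+C mpn_mod_1s_4p_cps at the bottom of this file) are first copied to the
+C local frame at 4(%esp)..20(%esp).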
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p)
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	sub	$28, %esp
+	mov	60(%esp), %edi		C cps[]
+	mov	8(%edi), %eax
+	mov	12(%edi), %edx
+	mov	16(%edi), %ecx
+	mov	20(%edi), %esi
+	mov	24(%edi), %edi
+	mov	%eax, 4(%esp)
+	mov	%edx, 8(%esp)
+	mov	%ecx, 12(%esp)
+	mov	%esi, 16(%esp)
+	mov	%edi, 20(%esp)
+	mov	52(%esp), %eax		C n
+	xor	%edi, %edi
+	mov	48(%esp), %esi		C up
+	lea	-12(%esi,%eax,4), %esi
+	and	$3, %eax
+	je	L(b0)
+	cmp	$2, %eax
+	jc	L(b1)
+	je	L(b2)
+
+L(b3):	mov	4(%esi), %eax
+	mull	4(%esp)
+	mov	(%esi), %ebp
+	add	%eax, %ebp
+	adc	%edx, %edi
+	mov	8(%esi), %eax
+	mull	8(%esp)
+	lea	-12(%esi), %esi
+	jmp	L(m0)
+
+L(b0):	mov	(%esi), %eax
+	mull	4(%esp)
+	mov	-4(%esi), %ebp
+	add	%eax, %ebp
+	adc	%edx, %edi
+	mov	4(%esi), %eax
+	mull	8(%esp)
+	add	%eax, %ebp
+	adc	%edx, %edi
+	mov	8(%esi), %eax
+	mull	12(%esp)
+	lea	-16(%esi), %esi
+	jmp	L(m0)
+
+L(b1):	mov	8(%esi), %ebp
+	lea	-4(%esi), %esi
+	jmp	L(m1)
+
+L(b2):	mov	8(%esi), %edi
+	mov	4(%esi), %ebp
+	lea	-8(%esi), %esi
+	jmp	L(m1)
+
+	ALIGN(16)
+L(top):	mov	(%esi), %eax
+	mull	4(%esp)
+	mov	-4(%esi), %ebx
+	xor	%ecx, %ecx
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	mov	4(%esi), %eax
+	mull	8(%esp)
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	mov	8(%esi), %eax
+	mull	12(%esp)
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	lea	-16(%esi), %esi
+	mov	16(%esp), %eax
+	mul	%ebp
+	add	%eax, %ebx
+	adc	%edx, %ecx
+	mov	20(%esp), %eax
+	mul	%edi
+	mov	%ebx, %ebp
+	mov	%ecx, %edi
+L(m0):	add	%eax, %ebp
+	adc	%edx, %edi
+L(m1):	subl	$4, 52(%esp)
+	ja	L(top)
+
+L(end):	mov	4(%esp), %eax
+	mul	%edi
+	mov	60(%esp), %edi
+	add	%eax, %ebp
+	adc	$0, %edx
+	mov	4(%edi), %ecx
+	mov	%edx, %esi
+	mov	%ebp, %eax
+	sal	%cl, %esi
+	mov	%ecx, %ebx
+	neg	%ecx
+	shr	%cl, %eax
+	or	%esi, %eax
+	lea	1(%eax), %esi
+	mull	(%edi)
+	mov	%ebx, %ecx
+	mov	%eax, %ebx
+	mov	%ebp, %eax
+	mov	56(%esp), %ebp
+	sal	%cl, %eax
+	add	%eax, %ebx
+	adc	%esi, %edx
+	imul	%ebp, %edx
+	sub	%edx, %eax
+	lea	(%eax,%ebp), %edx
+	cmp	%eax, %ebx
+	cmovc(	%edx, %eax)
+	mov	%eax, %edx
+	sub	%ebp, %eax
+	cmovc(	%edx, %eax)
+	add	$28, %esp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+	shr	%cl, %eax
+	ret
+EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_mod_1s_4p_cps)
+C CAUTION: This is the same code as in pentium4/sse2/mod_1_4.asm
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	mov	20(%esp), %ebp		C FIXME: avoid bp for 0-idx
+	mov	24(%esp), %ebx
+	bsr	%ebx, %ecx
+	xor	$31, %ecx
+	sal	%cl, %ebx		C b << cnt
+	mov	%ebx, %edx
+	not	%edx
+	mov	$-1, %eax
+	div	%ebx
+	xor	%edi, %edi
+	sub	%ebx, %edi
+	mov	$1, %esi
+	mov	%eax, (%ebp)		C store bi
+	mov	%ecx, 4(%ebp)		C store cnt
+	shld	%cl, %eax, %esi
+	imul	%edi, %esi
+	mov	%eax, %edi
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 8(%ebp)		C store B1modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 12(%ebp)		C store B2modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 16(%ebp)		C store B3modb
+
+	not	%edx
+	imul	%ebx, %edx
+	lea	(%edx,%ebx), %esi
+	cmp	%edx, %eax
+	cmovnc(	%edx, %esi)
+	mov	%edi, %eax
+	mul	%esi
+
+	add	%esi, %edx
+	shr	%cl, %esi
+	mov	%esi, 20(%ebp)		C store B4modb
+
+	not	%edx
+	imul	%ebx, %edx
+	add	%edx, %ebx
+	cmp	%edx, %eax
+	cmovnc(	%edx, %ebx)
+
+	shr	%cl, %ebx
+	mov	%ebx, 24(%ebp)		C store B5modb
+
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+	ret
+EPILOGUE()
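
As a companion to the patch above, here is a rough C sketch of the limb-folding
scheme that mpn_mod_1s_4p implements, assuming 32-bit limbs.  This is not GMP's
code (a portable version ships as mpn/generic/mod_1_4.c): the names mod1_4_cps
and mod1_4 are invented, the accumulator is fully reduced with a plain % each
pass, and the final divide-by-reciprocal step (the bi and cnt entries of cps[],
used after L(end) above) is likewise replaced by %.  The assembly instead
carries a two-limb accumulator between iterations, which is why it also needs
the fifth constant B^5 mod b.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: m[k] = B^k mod b for k = 1..5, where B = 2^32;
       m[0] is unused.  GMP's mpn_mod_1s_4p_cps additionally stores a
       reciprocal of b and a shift count, and keeps the constants
       pre-shifted; this sketch keeps them plain. */
    static void mod1_4_cps(uint32_t m[6], uint32_t b)
    {
        m[1] = (uint32_t)(((uint64_t)1 << 32) % b);            /* B   mod b */
        for (int k = 2; k <= 5; k++)                           /* B^k mod b */
            m[k] = (uint32_t)(((uint64_t)m[k - 1] << 32) % b);
    }

    /* Remainder of the n-limb number ap[0..n-1] (least significant limb
       first) modulo b, folding four limbs per iteration.  Requires
       b < 2^30 so the five-term sum below cannot overflow 64 bits. */
    static uint32_t mod1_4(const uint32_t *ap, long n, uint32_t b,
                           const uint32_t m[6])
    {
        uint64_t r = 0;
        long i = n;

        /* Peel the high n mod 4 limbs with plain Horner steps, standing in
           for the L(b1)/L(b2)/L(b3) entry paths of the assembly. */
        while (i & 3) {
            i--;
            r = ((r << 32) | ap[i]) % b;
        }

        /* Main loop, as in L(top):
           r*B^4 + a3*B^3 + a2*B^2 + a1*B + a0
             == a0 + a1*m1 + a2*m2 + a3*m3 + r*m4  (mod b).
           The four products are independent, so a CPU can overlap them
           instead of serializing on a division per limb. */
        while (i > 0) {
            i -= 4;
            uint64_t s = (uint64_t)ap[i]
                       + (uint64_t)ap[i + 1] * m[1]
                       + (uint64_t)ap[i + 2] * m[2]
                       + (uint64_t)ap[i + 3] * m[3]
                       + (uint64_t)(uint32_t)r * m[4];  /* r < b fits a limb */
            r = s % b;
        }
        return (uint32_t)r;
    }

    int main(void)
    {
        uint32_t ap[6] = { 0x89abcdefu, 0x01234567u, 0xdeadbeefu,
                           0x0badf00du, 0xcafebabeu, 0x12345678u };
        uint32_t b = 1000000007u;          /* any b < 2^30 works */
        uint32_t m[6];
        mod1_4_cps(m, b);

        /* Reference: one-limb-at-a-time Horner reduction. */
        uint64_t r = 0;
        for (int i = 5; i >= 0; i--)
            r = ((r << 32) | ap[i]) % b;

        printf("folded: %u, naive: %u\n", mod1_4(ap, 6, b, m), (uint32_t)r);
        return 0;
    }

The point of the 4-way unrolling is visible in the sketch: a naive Horner loop
does one dependent division per limb, while the folded form replaces it with
independent multiplies by precomputed residues and a single reduction per four
limbs, matching the roughly 4.75 cycles/limb the table above quotes for K7.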