From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001
From: Duncan Wilkie
Date: Sat, 18 Nov 2023 06:11:09 -0600
Subject: Initial commit.

---
 gmp-6.3.0/mpn/x86/k7/mod_1_1.asm | 221 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 221 insertions(+)
 create mode 100644 gmp-6.3.0/mpn/x86/k7/mod_1_1.asm

diff --git a/gmp-6.3.0/mpn/x86/k7/mod_1_1.asm b/gmp-6.3.0/mpn/x86/k7/mod_1_1.asm
new file mode 100644
index 0000000..1bbe6f9
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/k7/mod_1_1.asm
@@ -0,0 +1,221 @@
+dnl  x86-32 mpn_mod_1_1p, requiring cmov.
+
+dnl  Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
+
+dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                            cycles/limb
+C P5                          ?
+C P6 model 0-8,10-12          ?
+C P6 model 9  (Banias)        ?
+C P6 model 13 (Dothan)        ?
+C P4 model 0  (Willamette)    ?
+C P4 model 1  (?)             ?
+C P4 model 2  (Northwood)     ?
+C P4 model 3  (Prescott)      ?
+C P4 model 4  (Nocona)        ?
+C AMD K6                      ?
+C AMD K7                      7
+C AMD K8                      ?
+
+define(`B2mb', `%ebx')
+define(`r0', `%esi')
+define(`r2', `%ebp')
+define(`t0', `%edi')
+define(`ap', `%ecx')    C Also shift count
+
+C Stack frame
+C    pre      36(%esp)
+C    b        32(%esp)
+C    n        28(%esp)
+C    ap       24(%esp)
+C    return   20(%esp)
+C    %ebp     16(%esp)
+C    %edi     12(%esp)
+C    %esi      8(%esp)
+C    %ebx      4(%esp)
+C    B2mod      (%esp)
+
+define(`B2modb', `(%esp)')
+define(`n', `28(%esp)')
+define(`b', `32(%esp)')
+define(`pre', `36(%esp)')
+
+C mp_limb_t
+C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
+C
+C The pre array contains bi, cnt, B1modb, B2modb
+C Note: This implementation needs B1modb only when cnt > 0
+
+ASM_START()
+	TEXT
+	ALIGN(8)
+PROLOGUE(mpn_mod_1_1p)
+	push	%ebp
+	push	%edi
+	push	%esi
+	push	%ebx
+	mov	32(%esp), %ebp		C pre[]
+
+	mov	12(%ebp), %eax		C B2modb
+	push	%eax			C Put it on stack
+
+	mov	n, %edx
+	mov	24(%esp), ap
+
+	lea	(ap, %edx, 4), ap
+	mov	-4(ap), %eax
+	cmp	$3, %edx
+	jnc	L(first)
+	mov	-8(ap), r0
+	jmp	L(reduce_two)
+
+L(first):
+	C First iteration, no r2
+	mull	B2modb
+	mov	-12(ap), r0
+	add	%eax, r0
+	mov	-8(ap), %eax
+	adc	%edx, %eax
+	sbb	r2, r2
+	subl	$3, n
+	lea	-16(ap), ap
+	jz	L(reduce_three)
+
+	mov	B2modb, B2mb
+	sub	b, B2mb
+	lea	(B2mb, r0), t0
+	jmp	L(mid)
+
+	ALIGN(16)
+L(top):	C Loopmixed to 7 c/l on k7
+	add	%eax, r0
+	lea	(B2mb, r0), t0
+	mov	r2, %eax
+	adc	%edx, %eax
+	sbb	r2, r2
+L(mid):	mull	B2modb
+	and	B2modb, r2
+	add	r0, r2
+	decl	n
+	mov	(ap), r0
+	cmovc(	t0, r2)
+	lea	-4(ap), ap
+	jnz	L(top)
+
+	add	%eax, r0
+	mov	r2, %eax
+	adc	%edx, %eax
+	sbb	r2, r2
+
+L(reduce_three):
+	C Eliminate r2
+	and	b, r2
+	sub	r2, %eax
+
+L(reduce_two):
+	mov	pre, %ebp
+	movb	4(%ebp), %cl
+	test	%cl, %cl
+	jz	L(normalized)
+
+	C Unnormalized, use B1modb to reduce to size < B b
+	mull	8(%ebp)
+	xor	t0, t0
+	add	%eax, r0
+	adc	%edx, t0
+	mov	t0, %eax
+
+	C Left-shift to normalize
+	shld	%cl, r0, %eax		C Always use shld?
+
+	shl	%cl, r0
+	jmp	L(udiv)
+
+L(normalized):
+	mov	%eax, t0
+	sub	b, t0
+	cmovnc(	t0, %eax)
+
+L(udiv):
+	lea	1(%eax), t0
+	mull	(%ebp)
+	mov	b, %ebx			C Needed in register for lea
+	add	r0, %eax
+	adc	t0, %edx
+	imul	%ebx, %edx
+	sub	%edx, r0
+	cmp	r0, %eax
+	lea	(%ebx, r0), %eax
+	cmovnc(	r0, %eax)
+	cmp	%ebx, %eax
+	jnc	L(fix)
+L(ok):	shr	%cl, %eax
+
+	add	$4, %esp
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	pop	%ebp
+
+	ret
+L(fix):	sub	%ebx, %eax
+	jmp	L(ok)
+EPILOGUE()
+
+PROLOGUE(mpn_mod_1_1p_cps)
+	push	%ebp
+	mov	12(%esp), %ebp
+	push	%esi
+	bsr	%ebp, %ecx
+	push	%ebx
+	xor	$31, %ecx
+	mov	16(%esp), %esi
+	sal	%cl, %ebp
+	mov	%ebp, %edx
+	not	%edx
+	mov	$-1, %eax
+	div	%ebp			C On K7, invert_limb would be a few cycles faster.
+	mov	%eax, (%esi)		C store bi
+	mov	%ecx, 4(%esi)		C store cnt
+	neg	%ebp
+	mov	$1, %edx
+	shld	%cl, %eax, %edx
+	imul	%ebp, %edx
+	shr	%cl, %edx
+	imul	%ebp, %eax
+	mov	%edx, 8(%esi)		C store B1modb
+	mov	%eax, 12(%esi)		C store B2modb
+	pop	%ebx
+	pop	%esi
+	pop	%ebp
+	ret
+EPILOGUE()
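
For readers unfamiliar with the mod_1_1 scheme, the C sketch below illustrates the idea the
main loop above implements: keep the two most significant limbs as a running residue and
fold each further limb in with the precomputed constant B2modb = B^2 mod b, adding a small
correction when the two-limb accumulation carries out (the sbb/and/cmovc handling of r2 in
the assembly). This is only an illustrative sketch under stated assumptions, not GMP's
generic code: the limb size is fixed at 32 bits, the helper name mod_1_1_sketch is invented,
B2modb is recomputed directly instead of being read from pre[3] (which mpn_mod_1_1p_cps
precomputes for the shifted divisor), and the final remainder uses an ordinary % where the
assembly uses the precomputed reciprocal bi and shift count cnt to avoid a hardware divide.

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef uint32_t limb;                  /* one 32-bit limb, B = 2^32 */

    /* Hypothetical illustration of the folding idea behind mpn_mod_1_1p. */
    static limb
    mod_1_1_sketch (const limb *ap, size_t n, limb b)
    {
      assert (n >= 2 && b != 0);

      /* B2modb = B^2 mod b = 2^64 mod b; computed directly here for clarity. */
      uint64_t B2modb = (UINT64_MAX % b + 1) % b;

      /* Residue r starts as the two most significant limbs.  Invariant:
         r is congruent (mod b) to the prefix of limbs processed so far. */
      uint64_t r = ((uint64_t) ap[n - 1] << 32) | ap[n - 2];

      for (size_t i = n - 2; i-- > 0; )
        {
          /* r*B + ap[i] = hi(r)*B^2 + lo(r)*B + ap[i]
                         ≡ hi(r)*B2modb + lo(r)*B + ap[i]   (mod b) */
          uint64_t t = (r >> 32) * B2modb;          /* <= (B-1)(b-1)   */
          uint64_t u = (r << 32) | ap[i];           /* lo(r)*B + ap[i] */
          r = t + u;
          if (r < t)        /* carried out of 64 bits: 2^64 ≡ B2modb (mod b) */
            r += B2modb;    /* cannot carry again; mirrors the cmovc path    */
        }

      /* The assembly finishes with bi/cnt/B1modb from pre[]; a plain divide
         is enough here to produce the same result. */
      return (limb) (r % b);
    }

The same scheme works for 64-bit limbs with B = 2^64, which is why both the precomputation
and the explicit carry handling matter: the sum of the two partial products can exceed the
double-limb accumulator by one bit, exactly the case the assembly catches with sbb r2, r2.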