From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm | 645 +++++++++++++++++++++++++++ 1 file changed, 645 insertions(+) create mode 100644 gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm (limited to 'gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm') diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm new file mode 100644 index 0000000..0146fab --- /dev/null +++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/divrem_1.asm @@ -0,0 +1,645 @@ +dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division. + +dnl Copyright 1999-2004 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part. + + +C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t carry); +C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, +C mp_srcptr src, mp_size_t size, +C mp_limb_t divisor, mp_limb_t inverse, +C unsigned shift); +C +C Algorithm: +C +C The method and nomenclature follow part 8 of "Division by Invariant +C Integers using Multiplication" by Granlund and Montgomery, reference in +C gmp.texi. +C +C "m" is written for what is m' in the paper, and "d" for d_norm, which +C won't cause any confusion since it's only the normalized divisor that's of +C any use in the code. "b" is written for 2^N, the size of a limb, N being +C 32 here. +C +C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as +C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets +C us have just a psubq on the dependent chain. +C +C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here, +C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much. +C +C Notes: +C +C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high +C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero +C carry, since in normal circumstances that will be a very rare event. +C +C The test for skipping a division is branch free (once size>=1 is tested). +C The store to the destination high limb is 0 when a divide is skipped, or +C if it's not skipped then a copy of the src high limb is stored. The +C latter is in case src==dst. +C +C There's a small bias towards expecting xsize==0, by having code for +C xsize==0 in a straight line and xsize!=0 under forward jumps. +C +C Enhancements: +C +C The loop measures 32 cycles, but the dependent chain would suggest it +C could be done with 30. Not sure where to start looking for the extras. +C +C Alternatives: +C +C If the divisor is normalized (high bit set) then a division step can +C always be skipped, since the high destination limb is always 0 or 1 in +C that case. It doesn't seem worth checking for this though, since it +C probably occurs infrequently. + + +dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by +dnl inverse method is used, rather than plain "divl"s. Minimum value 1. +dnl +dnl The inverse takes about 80-90 cycles to calculate, but after that the +dnl multiply is 32 c/l versus division at about 58 c/l. +dnl +dnl At 4 limbs the div is a touch faster than the mul (and of course +dnl simpler), so start the mul from 5 limbs. + +deflit(MUL_THRESHOLD, 5) + + +defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1 +defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1 +defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c +defframe(PARAM_DIVISOR,20) +defframe(PARAM_SIZE, 16) +defframe(PARAM_SRC, 12) +defframe(PARAM_XSIZE, 8) +defframe(PARAM_DST, 4) + +dnl re-use parameter space +define(SAVE_ESI,`PARAM_SIZE') +define(SAVE_EBP,`PARAM_SRC') +define(SAVE_EDI,`PARAM_DIVISOR') +define(SAVE_EBX,`PARAM_DST') + + TEXT + + ALIGN(16) +PROLOGUE(mpn_preinv_divrem_1) +deflit(`FRAME',0) + + movl PARAM_SIZE, %ecx + xorl %edx, %edx C carry if can't skip a div + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebp, SAVE_EBP + movl PARAM_DIVISOR, %ebp + + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + movl -4(%esi,%ecx,4), %eax C src high limb + + movl %ebx, SAVE_EBX + movl PARAM_XSIZE, %ebx + + movd PARAM_PREINV_INVERSE, %mm4 + + movd PARAM_PREINV_SHIFT, %mm7 C l + cmpl %ebp, %eax C high cmp divisor + + cmovc( %eax, %edx) C high is carry if high new n2 + psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1 + + ASSERT(be,`C 0 or -1 + movd %mm3, %eax + addl $1, %eax + cmpl $1, %eax') + + paddd %mm3, %mm2 C q + pand %mm5, %mm3 C mask & d + + paddd %mm3, %mm0 C addback if necessary + movd %mm2, (%edi) + leal -4(%edi), %edi + + subl $1, %ecx + ja L(integer_top) + + +L(integer_last): + C eax + C ebx xsize + C ecx + C edx + C esi &src[0] + C edi &dst[xsize] + C + C mm0 n2 + C mm4 m + C mm5 d + C mm6 + C mm7 l + + ASSERT(b,`C n2 n2 + psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1 + + ASSERT(be,`C 0 or -1 + movd %mm3, %eax + addl $1, %eax + cmpl $1, %eax') + + paddd %mm3, %mm2 C q + pand %mm5, %mm3 C mask & d + + paddd %mm3, %mm0 C addback if necessary + movd %mm2, (%edi) + leal -4(%edi), %edi + + +L(integer_none): + C eax + C ebx xsize + + orl %ebx, %ebx + jnz L(fraction_some) C if xsize!=0 + + +L(fraction_done): + movl SAVE_EBP, %ebp + psrld %mm7, %mm0 C remainder + + movl SAVE_EDI, %edi + movd %mm0, %eax + + movl SAVE_ESI, %esi + movl SAVE_EBX, %ebx + emms + ret + + + +C ----------------------------------------------------------------------------- +C + +L(fraction_some): + C eax + C ebx xsize + C ecx + C edx + C esi + C edi &dst[xsize-1] + C ebp + + +L(fraction_top): + C eax + C ebx counter, xsize iterations + C ecx + C edx + C esi src, decrementing + C edi dst, decrementing + C + C mm0 n2 + C mm4 m + C mm5 d + C mm6 32-l + C mm7 l + + ASSERT(b,`C n2 n2 + psrlq $32, %mm1 C high n - (q1+1)*d, 0 or -1 + + ASSERT(be,`C 0 or -1 + movd %mm1, %eax + addl $1, %eax + cmpl $1, %eax') + + paddd %mm1, %mm2 C q + pand %mm5, %mm1 C mask & d + + paddd %mm1, %mm0 C addback if necessary + movd %mm2, (%edi) + leal -4(%edi), %edi + + subl $1, %ebx + jne L(fraction_top) + + + jmp L(fraction_done) + +EPILOGUE() -- cgit v1.2.3