From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm | 287 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 287 insertions(+) create mode 100644 gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm (limited to 'gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm') diff --git a/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm b/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm new file mode 100644 index 0000000..a0a9d90 --- /dev/null +++ b/gmp-6.3.0/mpn/x86/p6/bdiv_q_1.asm @@ -0,0 +1,287 @@ +dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder. + +dnl Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato. + +dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + +include(`../config.m4') + + +C odd even divisor +C P6: 10.0 12.0 cycles/limb + +C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1) + +C The odd case is basically the same as mpn_modexact_1_odd, just with an +C extra store, and it runs at the same 10 cycles which is the dependent +C chain. +C +C The shifts for the even case aren't on the dependent chain so in principle +C it could run the same too, but nothing running at 10 has been found. +C Perhaps there's too many uops (an extra 4 over the odd case). + +defframe(PARAM_SHIFT, 24) +defframe(PARAM_INVERSE,20) +defframe(PARAM_DIVISOR,16) +defframe(PARAM_SIZE, 12) +defframe(PARAM_SRC, 8) +defframe(PARAM_DST, 4) + +defframe(SAVE_EBX, -4) +defframe(SAVE_ESI, -8) +defframe(SAVE_EDI, -12) +defframe(SAVE_EBP, -16) +deflit(STACK_SPACE, 16) + +dnl re-use parameter space +define(VAR_INVERSE,`PARAM_SRC') + + TEXT + +C mp_limb_t +C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor, +C mp_limb_t inverse, int shift) + + ALIGN(16) +PROLOGUE(mpn_pi1_bdiv_q_1) +deflit(`FRAME',0) + + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + movl %ebp, SAVE_EBP + movl PARAM_INVERSE, %ebp + + movl PARAM_SHIFT, %ecx C trailing twos + +L(common): + movl %edi, SAVE_EDI + movl PARAM_DST, %edi + + leal (%esi,%ebx,4), %esi C src end + + leal (%edi,%ebx,4), %edi C dst end + negl %ebx C -size + + movl (%esi,%ebx,4), %eax C src[0] + + orl %ecx, %ecx + jz L(odd_entry) + + movl %edi, PARAM_DST + movl %ebp, VAR_INVERSE + +L(even): + C eax src[0] + C ebx counter, limbs, negative + C ecx shift + C edx + C esi + C edi + C ebp + + xorl %ebp, %ebp C initial carry bit + xorl %edx, %edx C initial carry limb (for size==1) + + incl %ebx + jz L(even_one) + + movl (%esi,%ebx,4), %edi C src[1] + + shrdl( %cl, %edi, %eax) + + jmp L(even_entry) + + +L(even_top): + C eax scratch + C ebx counter, limbs, negative + C ecx shift + C edx scratch + C esi &src[size] + C edi &dst[size] and scratch + C ebp carry bit + + movl (%esi,%ebx,4), %edi + + mull PARAM_DIVISOR + + movl -4(%esi,%ebx,4), %eax + shrdl( %cl, %edi, %eax) + + subl %ebp, %eax + + sbbl %ebp, %ebp + subl %edx, %eax + + sbbl $0, %ebp + +L(even_entry): + imull VAR_INVERSE, %eax + + movl PARAM_DST, %edi + negl %ebp + + movl %eax, -4(%edi,%ebx,4) + incl %ebx + jnz L(even_top) + + mull PARAM_DIVISOR + + movl -4(%esi), %eax + +L(even_one): + shrl %cl, %eax + movl SAVE_ESI, %esi + + subl %ebp, %eax + movl SAVE_EBP, %ebp + + subl %edx, %eax + movl SAVE_EBX, %ebx + + imull VAR_INVERSE, %eax + + movl %eax, -4(%edi) + movl SAVE_EDI, %edi + addl $STACK_SPACE, %esp + + ret + +C The dependent chain here is +C +C subl %edx, %eax 1 +C imull %ebp, %eax 4 +C mull PARAM_DIVISOR 5 +C ---- +C total 10 +C +C and this is the measured speed. No special scheduling is necessary, out +C of order execution hides the load latency. + +L(odd_top): + C eax scratch (src limb) + C ebx counter, limbs, negative + C ecx carry bit + C edx carry limb, high of last product + C esi &src[size] + C edi &dst[size] + C ebp inverse + + mull PARAM_DIVISOR + + movl (%esi,%ebx,4), %eax + subl %ecx, %eax + + sbbl %ecx, %ecx + subl %edx, %eax + + sbbl $0, %ecx + +L(odd_entry): + imull %ebp, %eax + + movl %eax, (%edi,%ebx,4) + negl %ecx + + incl %ebx + jnz L(odd_top) + + + movl SAVE_ESI, %esi + + movl SAVE_EDI, %edi + + movl SAVE_EBP, %ebp + + movl SAVE_EBX, %ebx + addl $STACK_SPACE, %esp + + ret + +EPILOGUE() + +C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, +C mp_limb_t divisor); +C + + ALIGN(16) +PROLOGUE(mpn_bdiv_q_1) +deflit(`FRAME',0) + + movl PARAM_DIVISOR, %eax + subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE) + + movl %esi, SAVE_ESI + movl PARAM_SRC, %esi + + movl %ebx, SAVE_EBX + movl PARAM_SIZE, %ebx + + bsfl %eax, %ecx C trailing twos + + movl %ebp, SAVE_EBP + + shrl %cl, %eax C d without twos + + movl %eax, %edx + shrl %eax C d/2 without twos + + movl %edx, PARAM_DIVISOR + andl $127, %eax + +ifdef(`PIC',` + LEA( binvert_limb_table, %ebp) + movzbl (%eax,%ebp), %ebp C inv 8 bits +',` + movzbl binvert_limb_table(%eax), %ebp C inv 8 bits +') + + leal (%ebp,%ebp), %eax C 2*inv + + imull %ebp, %ebp C inv*inv + imull %edx, %ebp C inv*inv*d + + subl %ebp, %eax C inv = 2*inv - inv*inv*d + leal (%eax,%eax), %ebp C 2*inv + + imull %eax, %eax C inv*inv + imull %edx, %eax C inv*inv*d + + subl %eax, %ebp C inv = 2*inv - inv*inv*d + + jmp L(common) + +EPILOGUE() +ASM_END() -- cgit v1.2.3