Diffstat (limited to 'gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm')
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm   281
1 file changed, 281 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm
new file mode 100644
index 0000000..c7f4426
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm
@@ -0,0 +1,281 @@
+dnl  X86-32 and X86-64 mpn_popcount using SSE2.
+
+dnl  Copyright 2006, 2007, 2011, 2015, 2020 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C 32-bit                        popcount     hamdist
+C                             cycles/limb  cycles/limb
+C P5                              -
+C P6 model 0-8,10-12              -
+C P6 model 9   (Banias)           ?
+C P6 model 13  (Dothan)           4
+C P4 model 0   (Willamette)       ?
+C P4 model 1   (?)                ?
+C P4 model 2   (Northwood)        3.9
+C P4 model 3   (Prescott)         ?
+C P4 model 4   (Nocona)           ?
+C AMD K6                          -
+C AMD K7                          -
+C AMD K8                          ?
+
+C 64-bit                        popcount     hamdist
+C                             cycles/limb  cycles/limb
+C P4 model 4 (Nocona):            8
+C AMD K8,K9                       7.5
+C AMD K10                         3.5
+C Intel core2                     3.68
+C Intel corei                     3.15
+C Intel atom                     10.8
+C VIA nano                        6.5
+
+C TODO
+C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
+C    using movdqu for one operand and movdqa for the other, or by painfully
+C    shifting as we go.  Unfortunately, there seems to be no usable shift
+C    instruction, except for one that takes an immediate count.
+C  * It would probably be possible to cut a few cycles/limb using software
+C    pipelining.
+C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
+C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
+C    integer-based popcount.  Such a combined loop would handle 6 limbs in
+C    about 30 cycles on K8.
+C  * We could save a byte or two by using 32-bit operations on areg.
+C  * Check if using movdqa to a temp register and then register-based pand is faster.
+
+ifelse(GMP_LIMB_BITS,`32',
+`       define(`up',  `%edx')
+        define(`n',   `%ecx')
+        define(`areg',`%eax')
+        define(`breg',`%ebx')
+        define(`zero',`%xmm4')
+        define(`LIMB32',` $1')
+        define(`LIMB64',`dnl')
+',`
+        define(`up',  `%rdi')
+        define(`n',   `%rsi')
+        define(`areg',`%rax')
+        define(`breg',`%rdx')
+        define(`zero',`%xmm8')
+        define(`LIMB32',`dnl')
+        define(`LIMB64',` $1')
+')
+
+define(`mm01010101',`%xmm6')
+define(`mm00110011',`%xmm7')
+define(`mm00001111',`%xmm2')
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
+
+undefine(`psadbw')                      C override inherited m4 version
+
+C This file is shared between 32-bit and 64-bit builds.  Only the former has
+C LEAL.  Default LEAL as an alias of LEA.
+ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')
+
+ASM_START()
+
+C Make cnsts global to work around Apple relocation bug.
+ifdef(`DARWIN',`
+        define(`cnsts', MPN(popccnsts))
+        GLOBL   cnsts')
+
+        TEXT
+        ALIGN(32)
+PROLOGUE(mpn_popcount)
+
+LIMB32(`        mov     4(%esp), up     ')
+LIMB32(`        mov     8(%esp), n      ')
+LIMB32(`        push    %ebx            ')
+
+        pxor    %xmm3, %xmm3            C zero grand total count
+LIMB64(`        pxor    zero, zero      ')
+
+        LEAL(   cnsts, breg)
+
+        movdqa  -48(breg), mm01010101
+        movdqa  -32(breg), mm00110011
+        movdqa  -16(breg), mm00001111
+
+        mov     up, areg
+        and     $-16, up                C round `up' down to 128-bit boundary
+        and     $12, areg               C 32:areg = 0, 4, 8, 12
+                                        C 64:areg = 0, 8
+        movdqa  (up), %xmm0
+        pand    64(breg,areg,4), %xmm0
+        shr     $m4_log2(GMP_LIMB_BYTES), %eax
+        add     areg, n                 C compensate n for rounded down `up'
+
+        pxor    %xmm4, %xmm4
+        sub     $LIMBS_PER_XMM, n
+        jbe     L(sum)
+
+        sub     $LIMBS_PER_XMM, n
+        ja      L(ent)
+        jmp     L(lsum)
+
+        ALIGN(16)
+L(top): movdqa  (up), %xmm0
+L(ent): movdqa  16(up), %xmm4
+
+        movdqa  %xmm0, %xmm1
+        movdqa  %xmm4, %xmm5
+        psrld   $1, %xmm0
+        psrld   $1, %xmm4
+        pand    mm01010101, %xmm0
+        pand    mm01010101, %xmm4
+        psubd   %xmm0, %xmm1
+        psubd   %xmm4, %xmm5
+
+        movdqa  %xmm1, %xmm0
+        movdqa  %xmm5, %xmm4
+        psrlq   $2, %xmm1
+        psrlq   $2, %xmm5
+        pand    mm00110011, %xmm0
+        pand    mm00110011, %xmm4
+        pand    mm00110011, %xmm1
+        pand    mm00110011, %xmm5
+        paddq   %xmm0, %xmm1
+        paddq   %xmm4, %xmm5
+
+LIMB32(`        pxor    zero, zero      ')
+
+        add     $32, up
+        sub     $LIMBS_PER_2XMM, n
+
+        paddq   %xmm5, %xmm1
+        movdqa  %xmm1, %xmm0
+        psrlq   $4, %xmm1
+        pand    mm00001111, %xmm0
+        pand    mm00001111, %xmm1
+        paddq   %xmm0, %xmm1
+
+        psadbw  zero, %xmm1
+        paddq   %xmm1, %xmm3            C add to grand total
+
+        jnc     L(top)
+L(end):
+        add     $LIMBS_PER_2XMM, n
+        jz      L(rt)
+        movdqa  (up), %xmm0
+        pxor    %xmm4, %xmm4
+        sub     $LIMBS_PER_XMM, n
+        jbe     L(sum)
+L(lsum):
+        movdqa  %xmm0, %xmm4
+        movdqa  16(up), %xmm0
+L(sum):
+        shl     $m4_log2(GMP_LIMB_BYTES), n
+        and     $12, n
+        pand    (breg,n,4), %xmm0
+
+        movdqa  %xmm0, %xmm1
+        movdqa  %xmm4, %xmm5
+        psrld   $1, %xmm0
+        psrld   $1, %xmm4
+        pand    mm01010101, %xmm0
+        pand    mm01010101, %xmm4
+        psubd   %xmm0, %xmm1
+        psubd   %xmm4, %xmm5
+
+        movdqa  %xmm1, %xmm0
+        movdqa  %xmm5, %xmm4
+        psrlq   $2, %xmm1
+        psrlq   $2, %xmm5
+        pand    mm00110011, %xmm0
+        pand    mm00110011, %xmm4
+        pand    mm00110011, %xmm1
+        pand    mm00110011, %xmm5
+        paddq   %xmm0, %xmm1
+        paddq   %xmm4, %xmm5
+
+LIMB32(`        pxor    zero, zero      ')
+
+        paddq   %xmm5, %xmm1
+        movdqa  %xmm1, %xmm0
+        psrlq   $4, %xmm1
+        pand    mm00001111, %xmm0
+        pand    mm00001111, %xmm1
+        paddq   %xmm0, %xmm1
+
+        psadbw  zero, %xmm1
+        paddq   %xmm1, %xmm3            C add to grand total
+
+
+C Add the two 64-bit halves of the grand total counter
+L(rt):  movdqa  %xmm3, %xmm0
+        psrldq  $8, %xmm3
+        paddq   %xmm3, %xmm0
+        movd    %xmm0, areg             C movq avoided due to gas bug
+
+LIMB32(`        pop     %ebx            ')
+        ret
+
+EPILOGUE()
+DEF_OBJECT(dummy,16)
+C Three magic constants used for masking out bits
+        .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+        .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+
+        .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+        .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+
+        .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+        .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+cnsts:
+C Masks for high end of number
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+C Masks for low end of number
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+        .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+END_OBJECT(dummy)
+ASM_END()
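
For readers tracing the mask constants and the psadbw step above, the loop body is the classic SWAR bit-counting reduction applied to 128-bit vectors. The C routine below is not part of GMP; it is a hypothetical reference sketch (ref_popcount is an invented name) that performs the same 0x55/0x33/0x0f reduction one 64-bit word at a time, with a multiply standing in for psadbw's byte sum, and it ignores the edge masking that the cnsts tables provide for unaligned array ends.

#include <stdint.h>
#include <stddef.h>

/* Hypothetical scalar reference for the vector loop above: count the set
   bits in n 64-bit words.  Each step mirrors one stage of the SSE2 code. */
static uint64_t
ref_popcount (const uint64_t *up, size_t n)
{
  uint64_t total = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t x = up[i];
      /* psrld/pand/psubd stage: fold each bit pair into a 2-bit count */
      x = x - ((x >> 1) & 0x5555555555555555ULL);
      /* psrlq $2 / pand / paddq stage: fold 2-bit counts into 4-bit counts */
      x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
      /* psrlq $4 / pand / paddq stage: fold 4-bit counts into byte counts */
      x = (x & 0x0f0f0f0f0f0f0f0fULL) + ((x >> 4) & 0x0f0f0f0f0f0f0f0fULL);
      /* psadbw analogue: add the eight byte counts into a single sum */
      total += (x * 0x0101010101010101ULL) >> 56;
    }
  return total;
}

The multiply-and-shift in the last step collects all byte counts into the top byte, which is the scalar counterpart of psadbw against a zero register followed by the final paddq into the grand total.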