aboutsummaryrefslogtreecommitdiff
path: root/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm
diff options
context:
space:
mode:
Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/bd1/popcount.asm')
-rw-r--r--gmp-6.3.0/mpn/x86_64/bd1/popcount.asm191
1 files changed, 191 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm b/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm
new file mode 100644
index 0000000..7b084f4
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86_64/bd1/popcount.asm
@@ -0,0 +1,191 @@
+dnl AMD64 SSSE3/XOP mpn_popcount -- population count.
+
+dnl Copyright 2010-2017 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C cycles/limb good for cpu?
+C AMD K8,K9 n/a
+C AMD K10 n/a
+C AMD bd1 1.27 y
+C AMD bd2 1.24 y
+C AMD bd3 ?
+C AMD bd4 1.22
+C AMD zen n/a
+C AMD bobcat n/a
+C AMD jaguar n/a
+C Intel P4 n/a
+C Intel CNR n/a
+C Intel PNR n/a
+C Intel NHM n/a
+C Intel SBR n/a
+C Intel IBR n/a
+C Intel HWL n/a
+C Intel BWL n/a
+C Intel SKL n/a
+C Intel atom n/a
+C Intel SLM n/a
+C VIA nano n/a
+
+C TODO
+C * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
+C intend to support old systems.
+
+C We use vpshlb and vpperm below, which are XOP extensions to AVX. Some
+C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
+C We fall back to the core2 code.
+ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
+MULFUNC_PROLOGUE(mpn_popcount)
+include_mpn(`x86_64/core2/popcount.asm')
+',`
+
+define(`up', `%rdi')
+define(`n', `%rsi')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_popcount)
+ FUNC_ENTRY(3)
+ lea L(cnsts)(%rip), %r9
+
+ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)',
+ `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)')
+ movdqa OFF1`'(%r9), %xmm7 C nibble counts table
+ movdqa OFF2`'(%r9), %xmm6 C splat shift counts
+ movdqa OFF3`'(%r9), %xmm9 C masks
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5 C 0-reg
+ pxor %xmm8, %xmm8 C grand total count
+
+ xor R32(%rdx), R32(%rdx)
+
+ mov R32(n), R32(%rax)
+ and $7, R32(%rax)
+ifdef(`PIC',`
+ movslq (%r9,%rax,4), %rax
+ add %r9, %rax
+ jmp *%rax
+',`
+ jmp *(%r9,%rax,8)
+')
+
+L(1): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up),%rdx
+ add $8, up
+ dec n
+ jnz L(top)
+ mov %rdx, %rax
+ FUNC_EXIT()
+ ret
+
+L(2): add $-48, up
+ jmp L(e2)
+
+L(3): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx
+ add $-40, up
+ jmp L(e2)
+
+L(4): add $-32, up
+ jmp L(e4)
+
+L(5): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx
+ add $-24, up
+ jmp L(e4)
+
+L(6): add $-16, up
+ jmp L(e6)
+
+L(7): .byte 0xf3,0x48,0x0f,0xb8,0x17 C popcnt (up), %rdx
+ add $-8, up
+ jmp L(e6)
+
+ ALIGN(32)
+L(top): lddqu (up), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm9, %xmm0
+ pand %xmm9, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1, %xmm7, %xmm7, %xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e6): lddqu 16(up), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm9, %xmm0
+ pand %xmm9, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0,%xmm7,%xmm7,%xmm2
+ .byte 0x8f,0xe8,0x40,0xa3,0xdf,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm3
+ paddb %xmm2, %xmm3
+ paddb %xmm3, %xmm4
+L(e4): lddqu 32(up), %xmm0
+ .byte 0x8f,0xe9,0x48,0x94,0xc8 C vpshlb %xmm6, %xmm0, %xmm1
+ pand %xmm9, %xmm0
+ pand %xmm9, %xmm1
+ .byte 0x8f,0xe8,0x40,0xa3,0xd7,0x00 C vpperm %xmm0, %xmm7, %xmm7, %xmm2
+ .byte 0x8f,0xe9,0x78,0xd3,0xec C vphaddubq %xmm4, %xmm5
+ .byte 0x8f,0xe8,0x40,0xa3,0xe7,0x10 C vpperm %xmm1,%xmm7,%xmm7,%xmm4
+ paddb %xmm2, %xmm4
+L(e2): popcnt 48(up), %r8
+ popcnt 56(up), %r9
+ add $64, up
+ paddq %xmm5, %xmm8 C sum to 2 x 64-bit counts
+ add %r8, %rdx
+ add %r9, %rdx
+ sub $8, n
+ jg L(top)
+
+ .byte 0x8f,0xe9,0x78,0xd3,0xec C vphaddubq %xmm4, %xmm5
+ paddq %xmm5, %xmm8
+ pshufd $14, %xmm8, %xmm0
+ paddq %xmm8, %xmm0
+ movd %xmm0, %rax
+ add %rdx, %rax
+ FUNC_EXIT()
+ ret
+EPILOGUE()
+DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
+ JMPENT( L(top), L(cnsts))
+ JMPENT( L(1), L(cnsts))
+ JMPENT( L(2), L(cnsts))
+ JMPENT( L(3), L(cnsts))
+ JMPENT( L(4), L(cnsts))
+ JMPENT( L(5), L(cnsts))
+ JMPENT( L(6), L(cnsts))
+ JMPENT( L(7), L(cnsts))
+ .byte 0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
+ .byte 0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
+ .byte -4,-4,-4,-4,-4,-4,-4,-4
+ .byte -4,-4,-4,-4,-4,-4,-4,-4
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+ .byte 0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+END_OBJECT(L(cnsts))
+')