diff options
author | Duncan Wilkie <antigravityd@gmail.com> | 2023-11-18 06:11:09 -0600 |
---|---|---|
committer | Duncan Wilkie <antigravityd@gmail.com> | 2023-11-18 06:11:09 -0600 |
commit | 11da511c784eca003deb90c23570f0873954e0de (patch) | |
tree | e14fdd3d5d6345956d67e79ae771d0633d28362b /gmp-6.3.0/mpn/x86_64/core2/hamdist.asm |
Initial commit.
Diffstat (limited to 'gmp-6.3.0/mpn/x86_64/core2/hamdist.asm')
-rw-r--r-- | gmp-6.3.0/mpn/x86_64/core2/hamdist.asm | 210 |
1 files changed, 210 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm b/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm new file mode 100644 index 0000000..ded7b67 --- /dev/null +++ b/gmp-6.3.0/mpn/x86_64/core2/hamdist.asm @@ -0,0 +1,210 @@ +dnl AMD64 SSSE3 mpn_hamdist -- hamming distance. + +dnl Copyright 2010-2017 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of either: +dnl +dnl * the GNU Lesser General Public License as published by the Free +dnl Software Foundation; either version 3 of the License, or (at your +dnl option) any later version. +dnl +dnl or +dnl +dnl * the GNU General Public License as published by the Free Software +dnl Foundation; either version 2 of the License, or (at your option) any +dnl later version. +dnl +dnl or both in parallel, as here. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +dnl for more details. +dnl +dnl You should have received copies of the GNU General Public License and the +dnl GNU Lesser General Public License along with the GNU MP Library. If not, +dnl see https://www.gnu.org/licenses/. + + +include(`../config.m4') + +C cycles/limb good for cpu? +C AMD K8,K9 n/a +C AMD K10 n/a +C AMD bd1 ? +C AMD bd2 ? +C AMD bd3 ? +C AMD bd4 ? +C AMD zen ? +C AMD bobcat ? +C AMD jaguar ? +C Intel P4 n/a +C Intel CNR 4.50 y +C Intel PNR 3.28 y +C Intel NHM ? +C Intel SBR ? +C Intel IBR ? +C Intel HWL ? +C Intel BWL ? +C Intel SKL ? +C Intel atom ? +C Intel SLM ? +C VIA nano ? + +C TODO +C * This was hand-written without too much thought about optimal insn +C selection; check to see of it can be improved. +C * Consider doing some instruction scheduling. 
dnl  Parameter registers (SysV AMD64 argument order).
define(`up', `%rdi')
define(`vp', `%rsi')
define(`n', `%rdx')

ASM_START()
	TEXT
	ALIGN(32)

C mp_bitcnt_t mpn_hamdist (mp_srcptr up, mp_srcptr vp, mp_size_t n)
C
C Return in %rax the hamming distance of {up,n} and {vp,n}: limbs are
C XORed and the set bits of the result are counted with the SSSE3
C pshufb nibble-lookup popcount technique.
C
C Register roles:
C   %xmm7   per-nibble popcount table (first 16 data bytes of L(cnsts))
C   %xmm6   0x0f byte mask, isolates one nibble per byte
C   %xmm5   constant zero, second operand for psadbw
C   %xmm4   per-byte partial counts, flushed once per loop iteration
C   %xmm8   running total as 2 x 64-bit counts
PROLOGUE(mpn_hamdist)
	lea	L(cnsts)(%rip), %r9

C The constant data follows the jump table inside L(cnsts); table
C entries are 4 bytes (relative) under PIC and 8 bytes (absolute)
C otherwise, hence the two sets of offsets to the LUT and the mask.
ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48)',
	     `define(`OFF1',64) define(`OFF2',80)')
	movdqa	OFF1`'(%r9), %xmm7	C nibble popcount lookup table
	movdqa	OFF2`'(%r9), %xmm6	C 0x0f,...,0x0f mask
	pxor	%xmm4, %xmm4		C clear byte accumulator
	pxor	%xmm5, %xmm5		C zero for psadbw
	pxor	%xmm8, %xmm8		C clear 64-bit accumulator

C Dispatch on n mod 8 to an entry point that disposes of the residual
C limbs, then fall into the 8-limbs-per-iteration main loop.
	mov	R32(n), R32(%rax)
	and	$7, R32(%rax)
ifdef(`PIC',`
	movslq	(%r9,%rax,4), %rax	C sign-extend 32-bit table entry
	add	%r9, %rax
	jmp	*%rax
',`
	jmp	*(%r9,%rax,8)
')

C Entry points for n mod 8 = 1..7.  Odd residues load one limb pair
C into the low halves of %xmm1/%xmm10 and XOR before jumping in; even
C residues only bias up/vp so the loop's fixed displacements line up.
L(1):	movq	(up), %xmm1
	add	$8, up
	movq	(vp), %xmm10
	add	$8, vp
	pxor	%xmm10, %xmm1
	jmp	L(e1)

L(2):	add	$-48, up
	add	$-48, vp
	jmp	L(e2)

L(3):	movq	(up), %xmm1
	add	$-40, up
	movq	(vp), %xmm10
	add	$-40, vp
	pxor	%xmm10, %xmm1
	jmp	L(e3)

L(4):	add	$-32, up
	add	$-32, vp
	jmp	L(e4)

L(5):	movq	(up), %xmm1
	add	$-24, up
	movq	(vp), %xmm10
	add	$-24, vp
	pxor	%xmm10, %xmm1
	jmp	L(e5)

L(6):	add	$-16, up
	add	$-16, vp
	jmp	L(e6)

L(7):	movq	(up), %xmm1
	add	$-8, up
	movq	(vp), %xmm10
	add	$-8, vp
	pxor	%xmm10, %xmm1
	jmp	L(e7)

C Main loop: 4 x 16 bytes = 8 limbs per iteration.  Each L(eN) stanza
C splits the XORed chunk in %xmm1 into low/high nibbles, translates
C both through the LUT with pshufb, and adds the byte counts into
C %xmm4.
	ALIGN(32)
L(top):	lddqu	(up), %xmm1
	lddqu	(vp), %xmm10
	pxor	%xmm10, %xmm1
L(e7):	movdqa	%xmm6, %xmm0		C copy mask register
	movdqa	%xmm7, %xmm2		C copy count register
	movdqa	%xmm7, %xmm3		C copy count register
	pand	%xmm1, %xmm0		C low nibbles
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1		C high nibbles
	pshufb	%xmm0, %xmm2		C LUT[low nibbles]
	pshufb	%xmm1, %xmm3		C LUT[high nibbles]
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(e6):	lddqu	16(up), %xmm1
	lddqu	16(vp), %xmm10
	pxor	%xmm10, %xmm1
L(e5):	movdqa	%xmm6, %xmm0
	movdqa	%xmm7, %xmm2
	movdqa	%xmm7, %xmm3
	pand	%xmm1, %xmm0
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1
	pshufb	%xmm0, %xmm2
	pshufb	%xmm1, %xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(e4):	lddqu	32(up), %xmm1
	lddqu	32(vp), %xmm10
	pxor	%xmm10, %xmm1
L(e3):	movdqa	%xmm6, %xmm0
	movdqa	%xmm7, %xmm2
	movdqa	%xmm7, %xmm3
	pand	%xmm1, %xmm0
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1
	pshufb	%xmm0, %xmm2
	pshufb	%xmm1, %xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(e2):	lddqu	48(up), %xmm1
	add	$64, up
	lddqu	48(vp), %xmm10
	add	$64, vp
	pxor	%xmm10, %xmm1
L(e1):	movdqa	%xmm6, %xmm0
	movdqa	%xmm7, %xmm2
	movdqa	%xmm7, %xmm3
	pand	%xmm1, %xmm0
	psrlw	$4, %xmm1
	pand	%xmm6, %xmm1
	pshufb	%xmm0, %xmm2
	pshufb	%xmm1, %xmm3
C Flush the byte accumulator into the 64-bit totals each iteration so
C the 8-bit lanes of %xmm4 cannot overflow; the current chunk's counts
C in %xmm3 become the new byte accumulator.
	psadbw	%xmm5, %xmm4		C sum to 8 x 16-bit counts
	paddb	%xmm2, %xmm3
	paddq	%xmm4, %xmm8		C sum to 2 x 64-bit counts
	movdqa	%xmm3, %xmm4
	sub	$8, n
	jg	L(top)

C Fold the last byte counts, then add the high qword of %xmm8 to the
C low qword (pshufd $14 moves dwords 3:2 down) and return in %rax.
	psadbw	%xmm5, %xmm4
	paddq	%xmm4, %xmm8
	pshufd	$14, %xmm8, %xmm0
	paddq	%xmm8, %xmm0
	movd	%xmm0, %rax
	ret
EPILOGUE()

C Jump table (8 entries, indexed by n mod 8) followed by the 16-byte
C nibble popcount table and the 16-byte 0x0f mask.
DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
	JMPENT(	L(top), L(cnsts))	C n mod 8 == 0
	JMPENT(	L(1), L(cnsts))
	JMPENT(	L(2), L(cnsts))
	JMPENT(	L(3), L(cnsts))
	JMPENT(	L(4), L(cnsts))
	JMPENT(	L(5), L(cnsts))
	JMPENT(	L(6), L(cnsts))
	JMPENT(	L(7), L(cnsts))
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03	C popcount of 0..7
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04	C popcount of 8..15
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f	C nibble mask
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
END_OBJECT(L(cnsts))