Diffstat (limited to 'gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm')
-rw-r--r--   gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm   281
1 file changed, 281 insertions, 0 deletions
diff --git a/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm b/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm
new file mode 100644
index 0000000..c7f4426
--- /dev/null
+++ b/gmp-6.3.0/mpn/x86/pentium4/sse2/popcount.asm
@@ -0,0 +1,281 @@
+dnl  X86-32 and X86-64 mpn_popcount using SSE2.
+
+dnl  Copyright 2006, 2007, 2011, 2015, 2020 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C 32-bit                        popcount     hamdist
+C                             cycles/limb  cycles/limb
+C P5                              -
+C P6 model 0-8,10-12              -
+C P6 model 9   (Banias)           ?
+C P6 model 13  (Dothan)           4
+C P4 model 0   (Willamette)       ?
+C P4 model 1   (?)                ?
+C P4 model 2   (Northwood)        3.9
+C P4 model 3   (Prescott)         ?
+C P4 model 4   (Nocona)           ?
+C AMD K6                          -
+C AMD K7                          -
+C AMD K8                          ?
+
+C 64-bit                        popcount     hamdist
+C                             cycles/limb  cycles/limb
+C P4 model 4 (Nocona):            8
+C AMD K8,K9                       7.5
+C AMD K10                         3.5
+C Intel core2                     3.68
+C Intel corei                     3.15
+C Intel atom                     10.8
+C VIA nano                        6.5
+
+C TODO
+C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
+C    using movdqu for one operand and movdqa for the other, or by painfully
+C    shifting as we go.  Unfortunately, there seems to be no usable shift
+C    instruction, except for one that takes an immediate count.
+C  * It would probably be possible to cut a few cycles/limb using software
+C    pipelining.
+C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
+C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
+C    integer-based popcount.  Such a combined loop would handle 6 limbs in
+C    about 30 cycles on K8.
+C  * We could save a byte or two by using 32-bit operations on areg.
+C  * Check if using movdqa to a temp register and then register-based pand is faster.
+
+ifelse(GMP_LIMB_BITS,`32',
+`       define(`up',  `%edx')
+        define(`n',   `%ecx')
+        define(`areg',`%eax')
+        define(`breg',`%ebx')
+        define(`zero',`%xmm4')
+        define(`LIMB32',` $1')
+        define(`LIMB64',`dnl')
+',`
+        define(`up',  `%rdi')
+        define(`n',   `%rsi')
+        define(`areg',`%rax')
+        define(`breg',`%rdx')
+        define(`zero',`%xmm8')
+        define(`LIMB32',`dnl')
+        define(`LIMB64',` $1')
+')
+
+define(`mm01010101',`%xmm6')
+define(`mm00110011',`%xmm7')
+define(`mm00001111',`%xmm2')
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
+
+undefine(`psadbw')                      C override inherited m4 version
+
+C This file is shared between 32-bit and 64-bit builds.  Only the former has
+C LEAL.  Default LEAL as an alias of LEA.
+ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')
+
+ASM_START()
+
+C Make cnsts global to work around Apple relocation bug.
+ifdef(`DARWIN',`
+        define(`cnsts', MPN(popccnsts))
+        GLOBL   cnsts')
+
+        TEXT
+        ALIGN(32)
+PROLOGUE(mpn_popcount)
+
+LIMB32(`        mov     4(%esp), up     ')
+LIMB32(`        mov     8(%esp), n      ')
+LIMB32(`        push    %ebx            ')
+
+        pxor    %xmm3, %xmm3            C zero grand total count
+LIMB64(`        pxor    zero, zero      ')
+
+        LEAL(   cnsts, breg)
+
+        movdqa  -48(breg), mm01010101
+        movdqa  -32(breg), mm00110011
+        movdqa  -16(breg), mm00001111
+
+        mov     up, areg
+        and     $-16, up                C round `up' down to 128-bit boundary
+        and     $12, areg               C 32:areg = 0, 4, 8, 12
+                                        C 64:areg = 0, 8
+        movdqa  (up), %xmm0
+        pand    64(breg,areg,4), %xmm0
+        shr     $m4_log2(GMP_LIMB_BYTES), %eax
+        add     areg, n                 C compensate n for rounded down `up'
+
+        pxor    %xmm4, %xmm4
+        sub     $LIMBS_PER_XMM, n
+        jbe     L(sum)
+
+        sub     $LIMBS_PER_XMM, n
+        ja      L(ent)
+        jmp     L(lsum)
+
+        ALIGN(16)
+L(top): movdqa  (up), %xmm0
+L(ent): movdqa  16(up), %xmm4
+
+        movdqa  %xmm0, %xmm1
+        movdqa  %xmm4, %xmm5
+        psrld   $1, %xmm0
+        psrld   $1, %xmm4
+        pand    mm01010101, %xmm0
+        pand    mm01010101, %xmm4
+        psubd   %xmm0, %xmm1
+        psubd   %xmm4, %xmm5
+
+        movdqa  %xmm1, %xmm0
+        movdqa  %xmm5, %xmm4
+        psrlq   $2, %xmm1
+        psrlq   $2, %xmm5
+        pand    mm00110011, %xmm0
+        pand    mm00110011, %xmm4
+        pand    mm00110011, %xmm1
+        pand    mm00110011, %xmm5
+        paddq   %xmm0, %xmm1
+        paddq   %xmm4, %xmm5
+
+LIMB32(`        pxor    zero, zero      ')
+
+        add     $32, up
+        sub     $LIMBS_PER_2XMM, n
+
+        paddq   %xmm5, %xmm1
+        movdqa  %xmm1, %xmm0
+        psrlq   $4, %xmm1
+        pand    mm00001111, %xmm0
+        pand    mm00001111, %xmm1
+        paddq   %xmm0, %xmm1
+
+        psadbw  zero, %xmm1
+        paddq   %xmm1, %xmm3            C add to grand total
+
+        jnc     L(top)
+L(end):
+        add     $LIMBS_PER_2XMM, n
+        jz      L(rt)
+        movdqa  (up), %xmm0
+        pxor    %xmm4, %xmm4
+        sub     $LIMBS_PER_XMM, n
+        jbe     L(sum)
+L(lsum):
+        movdqa  %xmm0, %xmm4
+        movdqa  16(up), %xmm0
+L(sum):
+        shl     $m4_log2(GMP_LIMB_BYTES), n
+        and     $12, n
+        pand    (breg,n,4), %xmm0
+
+        movdqa  %xmm0, %xmm1
+        movdqa  %xmm4, %xmm5
+        psrld   $1, %xmm0
+        psrld   $1, %xmm4
+        pand    mm01010101, %xmm0
+        pand    mm01010101, %xmm4
+        psubd   %xmm0, %xmm1
+        psubd   %xmm4, %xmm5
+
+        movdqa  %xmm1, %xmm0
+        movdqa  %xmm5, %xmm4
+        psrlq   $2, %xmm1
+        psrlq   $2, %xmm5
+        pand    mm00110011, %xmm0
+        pand    mm00110011, %xmm4
+        pand    mm00110011, %xmm1
+        pand    mm00110011, %xmm5
+        paddq   %xmm0, %xmm1
+        paddq   %xmm4, %xmm5
+
+LIMB32(`        pxor    zero, zero      ')
+
+        paddq   %xmm5, %xmm1
+        movdqa  %xmm1, %xmm0
+        psrlq   $4, %xmm1
+        pand    mm00001111, %xmm0
+        pand    mm00001111, %xmm1
+        paddq   %xmm0, %xmm1
+
+        psadbw  zero, %xmm1
+        paddq   %xmm1, %xmm3            C add to grand total
+
+
+C Add the two 64-bit halves of the grand total counter
+L(rt):  movdqa  %xmm3, %xmm0
+        psrldq  $8, %xmm3
+        paddq   %xmm3, %xmm0
+        movd    %xmm0, areg             C movq avoided due to gas bug
+
+LIMB32(`        pop     %ebx            ')
+        ret
+
+EPILOGUE()
+DEF_OBJECT(dummy,16)
+C Three magic constants used for masking out bits
+        .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+        .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+
+        .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+        .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+
+        .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+        .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+cnsts:
+C Masks for high end of number
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+C Masks for low end of number
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+        .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+END_OBJECT(dummy)
+ASM_END()
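
For readers tracing the mask constants and the psadbw step above, the loop body is the classic SWAR bit-counting reduction applied to 128-bit vectors. The C routine below is not part of GMP; it is a hypothetical reference sketch (ref_popcount is an invented name) that performs the same 0x55/0x33/0x0f reduction one 64-bit word at a time, with a multiply standing in for psadbw's byte sum, and it ignores the edge masking that the cnsts tables provide for unaligned array ends.

#include <stdint.h>
#include <stddef.h>

/* Hypothetical scalar reference for the vector loop above: count the set
   bits in n 64-bit words.  Each step mirrors one stage of the SSE2 code. */
static uint64_t
ref_popcount (const uint64_t *up, size_t n)
{
  uint64_t total = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t x = up[i];
      /* psrld/pand/psubd stage: fold each bit pair into a 2-bit count */
      x = x - ((x >> 1) & 0x5555555555555555ULL);
      /* psrlq $2 / pand / paddq stage: fold 2-bit counts into 4-bit counts */
      x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
      /* psrlq $4 / pand / paddq stage: fold 4-bit counts into byte counts */
      x = (x & 0x0f0f0f0f0f0f0f0fULL) + ((x >> 4) & 0x0f0f0f0f0f0f0f0fULL);
      /* psadbw analogue: add the eight byte counts into a single sum */
      total += (x * 0x0101010101010101ULL) >> 56;
    }
  return total;
}

The multiply-and-shift in the last step collects all byte counts into the top byte, which is the scalar counterpart of psadbw against a zero register followed by the final paddq into the grand total.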