dnl  ARM64 mpn_divrem_1 and mpn_preinv_divrem_1.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2020 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

dnl TODO
dnl  * Handle the most significant quotient limb for the unnormalised case
dnl    specially, just like in the C code.  (It is very often 0.)

define(`qp_arg',   x0)
define(`fn_arg',   x1)
define(`np_arg',   x2)
define(`n_arg',    x3)
define(`d_arg',    x4)
define(`dinv_arg', x5)
define(`cnt_arg',  x6)

define(`qp',   x19)
define(`np',   x20)
define(`n',    x21)
define(`d',    x22)
define(`fn',   x24)
define(`dinv', x0)
define(`cnt',  x23)
define(`tnc',  x8)

dnl mp_limb_t
dnl mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
dnl               mp_srcptr np, mp_size_t n,
dnl               mp_limb_t d_unnorm)

dnl mp_limb_t
dnl mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
dnl                      mp_srcptr np, mp_size_t n,
dnl                      mp_limb_t d_unnorm, mp_limb_t dinv, int cnt)

ASM_START()

PROLOGUE(mpn_preinv_divrem_1)
	cbz	n_arg, L(fz)
	stp	x29, x30, [sp, #-80]!
	mov	x29, sp
	stp	x19, x20, [sp, #16]
	stp	x21, x22, [sp, #32]
	stp	x23, x24, [sp, #48]

	sub	n, n_arg, #1
	add	x7, n, fn_arg
	add	np, np_arg, n, lsl #3
	add	qp, qp_arg, x7, lsl #3
	mov	fn, fn_arg
	mov	d, d_arg
	mov	dinv, dinv_arg
	tbnz	d_arg, #63, L(nentry)
	mov	cnt, cnt_arg
	b	L(uentry)
EPILOGUE()

PROLOGUE(mpn_divrem_1)
	cbz	n_arg, L(fz)
	stp	x29, x30, [sp, #-80]!
	mov	x29, sp
	stp	x19, x20, [sp, #16]
	stp	x21, x22, [sp, #32]
	stp	x23, x24, [sp, #48]

	sub	n, n_arg, #1
	add	x7, n, fn_arg
	add	np, np_arg, n, lsl #3
	add	qp, qp_arg, x7, lsl #3
	mov	fn, fn_arg
	mov	d, d_arg
	tbnz	d_arg, #63, L(normalised)

L(unnorm):
	clz	cnt, d
	lsl	x0, d, cnt
	bl	GSYM_PREFIX`'MPN(invert_limb)
L(uentry):
	lsl	d, d, cnt
	ldr	x7, [np], #-8
	sub	tnc, xzr, cnt
	lsr	x11, x7, tnc		C r
	lsl	x1, x7, cnt
	cbz	n, L(uend)

L(utop):ldr	x7, [np], #-8
	add	x2, x11, #1
	mul	x10, x11, dinv
	umulh	x17, x11, dinv
	lsr	x9, x7, tnc
	orr	x1, x1, x9
	adds	x10, x1, x10
	adc	x2, x2, x17
	msub	x11, d, x2, x1
	lsl	x1, x7, cnt
	cmp	x10, x11
	add	x14, x11, d
	csel	x11, x14, x11, cc
	sbc	x2, x2, xzr
	cmp	x11, d
	bcs	L(ufx)
L(uok):	str	x2, [qp], #-8
	sub	n, n, #1
	cbnz	n, L(utop)

L(uend):add	x2, x11, #1
	mul	x10, x11, dinv
	umulh	x17, x11, dinv
	adds	x10, x1, x10
	adc	x2, x2, x17
	msub	x11, d, x2, x1
	cmp	x10, x11
	add	x14, x11, d
	csel	x11, x14, x11, cc
	sbc	x2, x2, xzr
	subs	x14, x11, d
	adc	x2, x2, xzr
	csel	x11, x14, x11, cs
	str	x2, [qp], #-8

	cbnz	fn, L(ftop)
	lsr	x0, x11, cnt
	ldp	x19, x20, [sp, #16]
	ldp	x21, x22, [sp, #32]
	ldp	x23, x24, [sp, #48]
	ldp	x29, x30, [sp], #80
	ret

L(ufx):	add	x2, x2, #1
	sub	x11, x11, d
	b	L(uok)


L(normalised):
	mov	x0, d
	bl	GSYM_PREFIX`'MPN(invert_limb)
L(nentry):
	ldr	x7, [np], #-8
	subs	x14, x7, d
	adc	x2, xzr, xzr		C hi q limb
	csel	x11, x14, x7, cs
	b	L(nok)

L(ntop):ldr	x1, [np], #-8
	add	x2, x11, #1
	mul	x10, x11, dinv
	umulh	x17, x11, dinv
	adds	x10, x1, x10
	adc	x2, x2, x17
	msub	x11, d, x2, x1
	cmp	x10, x11
	add	x14, x11, d
	csel	x11, x14, x11, cc	C remainder
	sbc	x2, x2, xzr
	cmp	x11, d
	bcs	L(nfx)
L(nok):	str	x2, [qp], #-8
	sub	n, n, #1
	tbz	n, #63, L(ntop)

L(nend):cbnz	fn, L(frac)
	mov	x0, x11
	ldp	x19, x20, [sp, #16]
	ldp	x21, x22, [sp, #32]
	ldp	x23, x24, [sp, #48]
	ldp	x29, x30, [sp], #80
	ret

L(nfx):	add	x2, x2, #1
	sub	x11, x11, d
	b	L(nok)

L(frac):mov	cnt, #0
L(ftop):add	x2, x11, #1
	mul	x10, x11, dinv
	umulh	x17, x11, dinv
	add	x2, x2, x17
	msub	x11, d, x2, xzr
	cmp	x10, x11
	add	x14, x11, d
	csel	x11, x14, x11, cc	C remainder
	sbc	x2, x2, xzr
	str	x2, [qp], #-8
	sub	fn, fn, #1
	cbnz	fn, L(ftop)

	lsr	x0, x11, cnt
	ldp	x19, x20, [sp, #16]
	ldp	x21, x22, [sp, #32]
	ldp	x23, x24, [sp, #48]
	ldp	x29, x30, [sp], #80
	ret

C Block zero. We need this for the degenerated case of n = 0, fn != 0.
L(fz):	cbz	fn_arg, L(zend)
L(ztop):str	xzr, [qp_arg], #8
	sub	fn_arg, fn_arg, #1
	cbnz	fn_arg, L(ztop)
L(zend):mov	x0, #0
	ret
EPILOGUE()