From 11da511c784eca003deb90c23570f0873954e0de Mon Sep 17 00:00:00 2001 From: Duncan Wilkie Date: Sat, 18 Nov 2023 06:11:09 -0600 Subject: Initial commit. --- gmp-6.3.0/mpn/generic/add.c | 33 + gmp-6.3.0/mpn/generic/add_1.c | 33 + gmp-6.3.0/mpn/generic/add_err1_n.c | 100 ++ gmp-6.3.0/mpn/generic/add_err2_n.c | 116 ++ gmp-6.3.0/mpn/generic/add_err3_n.c | 131 +++ gmp-6.3.0/mpn/generic/add_n.c | 89 ++ gmp-6.3.0/mpn/generic/add_n_sub_n.c | 172 +++ gmp-6.3.0/mpn/generic/addmul_1.c | 145 +++ gmp-6.3.0/mpn/generic/bdiv_dbm1c.c | 58 + gmp-6.3.0/mpn/generic/bdiv_q.c | 76 ++ gmp-6.3.0/mpn/generic/bdiv_q_1.c | 121 +++ gmp-6.3.0/mpn/generic/bdiv_qr.c | 84 ++ gmp-6.3.0/mpn/generic/binvert.c | 106 ++ gmp-6.3.0/mpn/generic/broot.c | 195 ++++ gmp-6.3.0/mpn/generic/brootinv.c | 159 +++ gmp-6.3.0/mpn/generic/bsqrt.c | 47 + gmp-6.3.0/mpn/generic/bsqrtinv.c | 103 ++ gmp-6.3.0/mpn/generic/cmp.c | 33 + gmp-6.3.0/mpn/generic/cnd_add_n.c | 69 ++ gmp-6.3.0/mpn/generic/cnd_sub_n.c | 69 ++ gmp-6.3.0/mpn/generic/cnd_swap.c | 50 + gmp-6.3.0/mpn/generic/com.c | 44 + gmp-6.3.0/mpn/generic/comb_tables.c | 47 + gmp-6.3.0/mpn/generic/compute_powtab.c | 373 +++++++ gmp-6.3.0/mpn/generic/copyd.c | 40 + gmp-6.3.0/mpn/generic/copyi.c | 42 + gmp-6.3.0/mpn/generic/dcpi1_bdiv_q.c | 161 +++ gmp-6.3.0/mpn/generic/dcpi1_bdiv_qr.c | 176 ++++ gmp-6.3.0/mpn/generic/dcpi1_div_q.c | 86 ++ gmp-6.3.0/mpn/generic/dcpi1_div_qr.c | 248 +++++ gmp-6.3.0/mpn/generic/dcpi1_divappr_q.c | 256 +++++ gmp-6.3.0/mpn/generic/div_q.c | 313 ++++++ gmp-6.3.0/mpn/generic/div_qr_1.c | 125 +++ gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c | 505 +++++++++ gmp-6.3.0/mpn/generic/div_qr_1n_pi2.c | 203 ++++ gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c | 236 +++++ gmp-6.3.0/mpn/generic/div_qr_2.c | 314 ++++++ gmp-6.3.0/mpn/generic/div_qr_2n_pi1.c | 84 ++ gmp-6.3.0/mpn/generic/div_qr_2u_pi1.c | 76 ++ gmp-6.3.0/mpn/generic/dive_1.c | 146 +++ gmp-6.3.0/mpn/generic/diveby3.c | 173 +++ gmp-6.3.0/mpn/generic/divexact.c | 296 ++++++ gmp-6.3.0/mpn/generic/divis.c | 194 ++++ gmp-6.3.0/mpn/generic/divrem.c | 103 ++ gmp-6.3.0/mpn/generic/divrem_1.c | 254 +++++ gmp-6.3.0/mpn/generic/divrem_2.c | 118 +++ gmp-6.3.0/mpn/generic/dump.c | 99 ++ gmp-6.3.0/mpn/generic/fib2_ui.c | 174 +++ gmp-6.3.0/mpn/generic/fib2m.c | 252 +++++ gmp-6.3.0/mpn/generic/gcd.c | 266 +++++ gmp-6.3.0/mpn/generic/gcd_1.c | 103 ++ gmp-6.3.0/mpn/generic/gcd_11.c | 74 ++ gmp-6.3.0/mpn/generic/gcd_22.c | 131 +++ gmp-6.3.0/mpn/generic/gcd_subdiv_step.c | 204 ++++ gmp-6.3.0/mpn/generic/gcdext.c | 557 ++++++++++ gmp-6.3.0/mpn/generic/gcdext_1.c | 275 +++++ gmp-6.3.0/mpn/generic/gcdext_lehmer.c | 336 ++++++ gmp-6.3.0/mpn/generic/get_d.c | 438 ++++++++ gmp-6.3.0/mpn/generic/get_str.c | 451 ++++++++ gmp-6.3.0/mpn/generic/gmp-mparam.h | 33 + gmp-6.3.0/mpn/generic/hgcd.c | 182 ++++ gmp-6.3.0/mpn/generic/hgcd2-div.h | 504 +++++++++ gmp-6.3.0/mpn/generic/hgcd2.c | 283 +++++ gmp-6.3.0/mpn/generic/hgcd2_jacobi.c | 251 +++++ gmp-6.3.0/mpn/generic/hgcd_appr.c | 267 +++++ gmp-6.3.0/mpn/generic/hgcd_jacobi.c | 243 +++++ gmp-6.3.0/mpn/generic/hgcd_matrix.c | 265 +++++ gmp-6.3.0/mpn/generic/hgcd_reduce.c | 242 +++++ gmp-6.3.0/mpn/generic/hgcd_step.c | 127 +++ gmp-6.3.0/mpn/generic/invert.c | 86 ++ gmp-6.3.0/mpn/generic/invertappr.c | 300 ++++++ gmp-6.3.0/mpn/generic/jacbase.c | 242 +++++ gmp-6.3.0/mpn/generic/jacobi.c | 294 ++++++ gmp-6.3.0/mpn/generic/jacobi_2.c | 351 +++++++ gmp-6.3.0/mpn/generic/logops_n.c | 77 ++ gmp-6.3.0/mpn/generic/lshift.c | 72 ++ gmp-6.3.0/mpn/generic/lshiftc.c | 73 ++ gmp-6.3.0/mpn/generic/matrix22_mul.c | 321 ++++++ 
.../mpn/generic/matrix22_mul1_inverse_vector.c | 64 ++ gmp-6.3.0/mpn/generic/mod_1.c | 278 +++++ gmp-6.3.0/mpn/generic/mod_1_1.c | 341 ++++++ gmp-6.3.0/mpn/generic/mod_1_2.c | 148 +++ gmp-6.3.0/mpn/generic/mod_1_3.c | 155 +++ gmp-6.3.0/mpn/generic/mod_1_4.c | 170 +++ gmp-6.3.0/mpn/generic/mod_34lsub1.c | 128 +++ gmp-6.3.0/mpn/generic/mode1o.c | 235 +++++ gmp-6.3.0/mpn/generic/mu_bdiv_q.c | 281 +++++ gmp-6.3.0/mpn/generic/mu_bdiv_qr.c | 312 ++++++ gmp-6.3.0/mpn/generic/mu_div_q.c | 184 ++++ gmp-6.3.0/mpn/generic/mu_div_qr.c | 417 ++++++++ gmp-6.3.0/mpn/generic/mu_divappr_q.c | 368 +++++++ gmp-6.3.0/mpn/generic/mul.c | 441 ++++++++ gmp-6.3.0/mpn/generic/mul_1.c | 96 ++ gmp-6.3.0/mpn/generic/mul_basecase.c | 165 +++ gmp-6.3.0/mpn/generic/mul_fft.c | 1105 ++++++++++++++++++++ gmp-6.3.0/mpn/generic/mul_n.c | 96 ++ gmp-6.3.0/mpn/generic/mullo_basecase.c | 90 ++ gmp-6.3.0/mpn/generic/mullo_n.c | 243 +++++ gmp-6.3.0/mpn/generic/mulmid.c | 255 +++++ gmp-6.3.0/mpn/generic/mulmid_basecase.c | 82 ++ gmp-6.3.0/mpn/generic/mulmid_n.c | 61 ++ gmp-6.3.0/mpn/generic/mulmod_bknp1.c | 502 +++++++++ gmp-6.3.0/mpn/generic/mulmod_bnm1.c | 374 +++++++ gmp-6.3.0/mpn/generic/neg.c | 33 + gmp-6.3.0/mpn/generic/nussbaumer_mul.c | 70 ++ gmp-6.3.0/mpn/generic/perfpow.c | 342 ++++++ gmp-6.3.0/mpn/generic/perfsqr.c | 238 +++++ gmp-6.3.0/mpn/generic/popham.c | 125 +++ gmp-6.3.0/mpn/generic/pow_1.c | 135 +++ gmp-6.3.0/mpn/generic/powlo.c | 188 ++++ gmp-6.3.0/mpn/generic/powm.c | 1003 ++++++++++++++++++ gmp-6.3.0/mpn/generic/pre_divrem_1.c | 145 +++ gmp-6.3.0/mpn/generic/pre_mod_1.c | 61 ++ gmp-6.3.0/mpn/generic/random.c | 50 + gmp-6.3.0/mpn/generic/random2.c | 105 ++ gmp-6.3.0/mpn/generic/redc_1.c | 56 + gmp-6.3.0/mpn/generic/redc_2.c | 110 ++ gmp-6.3.0/mpn/generic/redc_n.c | 80 ++ gmp-6.3.0/mpn/generic/remove.c | 182 ++++ gmp-6.3.0/mpn/generic/rootrem.c | 515 +++++++++ gmp-6.3.0/mpn/generic/rshift.c | 69 ++ gmp-6.3.0/mpn/generic/sbpi1_bdiv_q.c | 96 ++ gmp-6.3.0/mpn/generic/sbpi1_bdiv_qr.c | 82 ++ gmp-6.3.0/mpn/generic/sbpi1_bdiv_r.c | 79 ++ gmp-6.3.0/mpn/generic/sbpi1_div_q.c | 302 ++++++ gmp-6.3.0/mpn/generic/sbpi1_div_qr.c | 109 ++ gmp-6.3.0/mpn/generic/sbpi1_divappr_q.c | 198 ++++ gmp-6.3.0/mpn/generic/scan0.c | 59 ++ gmp-6.3.0/mpn/generic/scan1.c | 59 ++ gmp-6.3.0/mpn/generic/sec_aors_1.c | 59 ++ gmp-6.3.0/mpn/generic/sec_div.c | 131 +++ gmp-6.3.0/mpn/generic/sec_invert.c | 177 ++++ gmp-6.3.0/mpn/generic/sec_mul.c | 48 + gmp-6.3.0/mpn/generic/sec_pi1_div.c | 172 +++ gmp-6.3.0/mpn/generic/sec_powm.c | 430 ++++++++ gmp-6.3.0/mpn/generic/sec_sqr.c | 76 ++ gmp-6.3.0/mpn/generic/sec_tabselect.c | 134 +++ gmp-6.3.0/mpn/generic/set_str.c | 290 +++++ gmp-6.3.0/mpn/generic/sizeinbase.c | 49 + gmp-6.3.0/mpn/generic/sqr.c | 98 ++ gmp-6.3.0/mpn/generic/sqr_basecase.c | 361 +++++++ gmp-6.3.0/mpn/generic/sqrlo.c | 239 +++++ gmp-6.3.0/mpn/generic/sqrlo_basecase.c | 194 ++++ gmp-6.3.0/mpn/generic/sqrmod_bnm1.c | 328 ++++++ gmp-6.3.0/mpn/generic/sqrtrem.c | 555 ++++++++++ gmp-6.3.0/mpn/generic/strongfibo.c | 219 ++++ gmp-6.3.0/mpn/generic/sub.c | 33 + gmp-6.3.0/mpn/generic/sub_1.c | 33 + gmp-6.3.0/mpn/generic/sub_err1_n.c | 100 ++ gmp-6.3.0/mpn/generic/sub_err2_n.c | 116 ++ gmp-6.3.0/mpn/generic/sub_err3_n.c | 131 +++ gmp-6.3.0/mpn/generic/sub_n.c | 89 ++ gmp-6.3.0/mpn/generic/submul_1.c | 144 +++ gmp-6.3.0/mpn/generic/tdiv_qr.c | 386 +++++++ gmp-6.3.0/mpn/generic/toom22_mul.c | 222 ++++ gmp-6.3.0/mpn/generic/toom2_sqr.c | 155 +++ gmp-6.3.0/mpn/generic/toom32_mul.c | 320 ++++++ gmp-6.3.0/mpn/generic/toom33_mul.c | 316 ++++++ 
gmp-6.3.0/mpn/generic/toom3_sqr.c | 221 ++++ gmp-6.3.0/mpn/generic/toom42_mul.c | 234 +++++ gmp-6.3.0/mpn/generic/toom42_mulmid.c | 237 +++++ gmp-6.3.0/mpn/generic/toom43_mul.c | 238 +++++ gmp-6.3.0/mpn/generic/toom44_mul.c | 239 +++++ gmp-6.3.0/mpn/generic/toom4_sqr.c | 164 +++ gmp-6.3.0/mpn/generic/toom52_mul.c | 256 +++++ gmp-6.3.0/mpn/generic/toom53_mul.c | 331 ++++++ gmp-6.3.0/mpn/generic/toom54_mul.c | 142 +++ gmp-6.3.0/mpn/generic/toom62_mul.c | 310 ++++++ gmp-6.3.0/mpn/generic/toom63_mul.c | 231 ++++ gmp-6.3.0/mpn/generic/toom6_sqr.c | 181 ++++ gmp-6.3.0/mpn/generic/toom6h_mul.c | 262 +++++ gmp-6.3.0/mpn/generic/toom8_sqr.c | 225 ++++ gmp-6.3.0/mpn/generic/toom8h_mul.c | 305 ++++++ gmp-6.3.0/mpn/generic/toom_couple_handling.c | 80 ++ gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm1.c | 72 ++ gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm2.c | 97 ++ gmp-6.3.0/mpn/generic/toom_eval_pm1.c | 89 ++ gmp-6.3.0/mpn/generic/toom_eval_pm2.c | 130 +++ gmp-6.3.0/mpn/generic/toom_eval_pm2exp.c | 127 +++ gmp-6.3.0/mpn/generic/toom_eval_pm2rexp.c | 101 ++ gmp-6.3.0/mpn/generic/toom_interpolate_12pts.c | 374 +++++++ gmp-6.3.0/mpn/generic/toom_interpolate_16pts.c | 545 ++++++++++ gmp-6.3.0/mpn/generic/toom_interpolate_5pts.c | 198 ++++ gmp-6.3.0/mpn/generic/toom_interpolate_6pts.c | 241 +++++ gmp-6.3.0/mpn/generic/toom_interpolate_7pts.c | 274 +++++ gmp-6.3.0/mpn/generic/toom_interpolate_8pts.c | 211 ++++ gmp-6.3.0/mpn/generic/trialdiv.c | 131 +++ gmp-6.3.0/mpn/generic/udiv_w_sdiv.c | 141 +++ gmp-6.3.0/mpn/generic/zero.c | 41 + gmp-6.3.0/mpn/generic/zero_p.c | 33 + 190 files changed, 37323 insertions(+) create mode 100644 gmp-6.3.0/mpn/generic/add.c create mode 100644 gmp-6.3.0/mpn/generic/add_1.c create mode 100644 gmp-6.3.0/mpn/generic/add_err1_n.c create mode 100644 gmp-6.3.0/mpn/generic/add_err2_n.c create mode 100644 gmp-6.3.0/mpn/generic/add_err3_n.c create mode 100644 gmp-6.3.0/mpn/generic/add_n.c create mode 100644 gmp-6.3.0/mpn/generic/add_n_sub_n.c create mode 100644 gmp-6.3.0/mpn/generic/addmul_1.c create mode 100644 gmp-6.3.0/mpn/generic/bdiv_dbm1c.c create mode 100644 gmp-6.3.0/mpn/generic/bdiv_q.c create mode 100644 gmp-6.3.0/mpn/generic/bdiv_q_1.c create mode 100644 gmp-6.3.0/mpn/generic/bdiv_qr.c create mode 100644 gmp-6.3.0/mpn/generic/binvert.c create mode 100644 gmp-6.3.0/mpn/generic/broot.c create mode 100644 gmp-6.3.0/mpn/generic/brootinv.c create mode 100644 gmp-6.3.0/mpn/generic/bsqrt.c create mode 100644 gmp-6.3.0/mpn/generic/bsqrtinv.c create mode 100644 gmp-6.3.0/mpn/generic/cmp.c create mode 100644 gmp-6.3.0/mpn/generic/cnd_add_n.c create mode 100644 gmp-6.3.0/mpn/generic/cnd_sub_n.c create mode 100644 gmp-6.3.0/mpn/generic/cnd_swap.c create mode 100644 gmp-6.3.0/mpn/generic/com.c create mode 100644 gmp-6.3.0/mpn/generic/comb_tables.c create mode 100644 gmp-6.3.0/mpn/generic/compute_powtab.c create mode 100644 gmp-6.3.0/mpn/generic/copyd.c create mode 100644 gmp-6.3.0/mpn/generic/copyi.c create mode 100644 gmp-6.3.0/mpn/generic/dcpi1_bdiv_q.c create mode 100644 gmp-6.3.0/mpn/generic/dcpi1_bdiv_qr.c create mode 100644 gmp-6.3.0/mpn/generic/dcpi1_div_q.c create mode 100644 gmp-6.3.0/mpn/generic/dcpi1_div_qr.c create mode 100644 gmp-6.3.0/mpn/generic/dcpi1_divappr_q.c create mode 100644 gmp-6.3.0/mpn/generic/div_q.c create mode 100644 gmp-6.3.0/mpn/generic/div_qr_1.c create mode 100644 gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c create mode 100644 gmp-6.3.0/mpn/generic/div_qr_1n_pi2.c create mode 100644 gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c create mode 100644 gmp-6.3.0/mpn/generic/div_qr_2.c 
create mode 100644 gmp-6.3.0/mpn/generic/div_qr_2n_pi1.c create mode 100644 gmp-6.3.0/mpn/generic/div_qr_2u_pi1.c create mode 100644 gmp-6.3.0/mpn/generic/dive_1.c create mode 100644 gmp-6.3.0/mpn/generic/diveby3.c create mode 100644 gmp-6.3.0/mpn/generic/divexact.c create mode 100644 gmp-6.3.0/mpn/generic/divis.c create mode 100644 gmp-6.3.0/mpn/generic/divrem.c create mode 100644 gmp-6.3.0/mpn/generic/divrem_1.c create mode 100644 gmp-6.3.0/mpn/generic/divrem_2.c create mode 100644 gmp-6.3.0/mpn/generic/dump.c create mode 100644 gmp-6.3.0/mpn/generic/fib2_ui.c create mode 100644 gmp-6.3.0/mpn/generic/fib2m.c create mode 100644 gmp-6.3.0/mpn/generic/gcd.c create mode 100644 gmp-6.3.0/mpn/generic/gcd_1.c create mode 100644 gmp-6.3.0/mpn/generic/gcd_11.c create mode 100644 gmp-6.3.0/mpn/generic/gcd_22.c create mode 100644 gmp-6.3.0/mpn/generic/gcd_subdiv_step.c create mode 100644 gmp-6.3.0/mpn/generic/gcdext.c create mode 100644 gmp-6.3.0/mpn/generic/gcdext_1.c create mode 100644 gmp-6.3.0/mpn/generic/gcdext_lehmer.c create mode 100644 gmp-6.3.0/mpn/generic/get_d.c create mode 100644 gmp-6.3.0/mpn/generic/get_str.c create mode 100644 gmp-6.3.0/mpn/generic/gmp-mparam.h create mode 100644 gmp-6.3.0/mpn/generic/hgcd.c create mode 100644 gmp-6.3.0/mpn/generic/hgcd2-div.h create mode 100644 gmp-6.3.0/mpn/generic/hgcd2.c create mode 100644 gmp-6.3.0/mpn/generic/hgcd2_jacobi.c create mode 100644 gmp-6.3.0/mpn/generic/hgcd_appr.c create mode 100644 gmp-6.3.0/mpn/generic/hgcd_jacobi.c create mode 100644 gmp-6.3.0/mpn/generic/hgcd_matrix.c create mode 100644 gmp-6.3.0/mpn/generic/hgcd_reduce.c create mode 100644 gmp-6.3.0/mpn/generic/hgcd_step.c create mode 100644 gmp-6.3.0/mpn/generic/invert.c create mode 100644 gmp-6.3.0/mpn/generic/invertappr.c create mode 100644 gmp-6.3.0/mpn/generic/jacbase.c create mode 100644 gmp-6.3.0/mpn/generic/jacobi.c create mode 100644 gmp-6.3.0/mpn/generic/jacobi_2.c create mode 100644 gmp-6.3.0/mpn/generic/logops_n.c create mode 100644 gmp-6.3.0/mpn/generic/lshift.c create mode 100644 gmp-6.3.0/mpn/generic/lshiftc.c create mode 100644 gmp-6.3.0/mpn/generic/matrix22_mul.c create mode 100644 gmp-6.3.0/mpn/generic/matrix22_mul1_inverse_vector.c create mode 100644 gmp-6.3.0/mpn/generic/mod_1.c create mode 100644 gmp-6.3.0/mpn/generic/mod_1_1.c create mode 100644 gmp-6.3.0/mpn/generic/mod_1_2.c create mode 100644 gmp-6.3.0/mpn/generic/mod_1_3.c create mode 100644 gmp-6.3.0/mpn/generic/mod_1_4.c create mode 100644 gmp-6.3.0/mpn/generic/mod_34lsub1.c create mode 100644 gmp-6.3.0/mpn/generic/mode1o.c create mode 100644 gmp-6.3.0/mpn/generic/mu_bdiv_q.c create mode 100644 gmp-6.3.0/mpn/generic/mu_bdiv_qr.c create mode 100644 gmp-6.3.0/mpn/generic/mu_div_q.c create mode 100644 gmp-6.3.0/mpn/generic/mu_div_qr.c create mode 100644 gmp-6.3.0/mpn/generic/mu_divappr_q.c create mode 100644 gmp-6.3.0/mpn/generic/mul.c create mode 100644 gmp-6.3.0/mpn/generic/mul_1.c create mode 100644 gmp-6.3.0/mpn/generic/mul_basecase.c create mode 100644 gmp-6.3.0/mpn/generic/mul_fft.c create mode 100644 gmp-6.3.0/mpn/generic/mul_n.c create mode 100644 gmp-6.3.0/mpn/generic/mullo_basecase.c create mode 100644 gmp-6.3.0/mpn/generic/mullo_n.c create mode 100644 gmp-6.3.0/mpn/generic/mulmid.c create mode 100644 gmp-6.3.0/mpn/generic/mulmid_basecase.c create mode 100644 gmp-6.3.0/mpn/generic/mulmid_n.c create mode 100644 gmp-6.3.0/mpn/generic/mulmod_bknp1.c create mode 100644 gmp-6.3.0/mpn/generic/mulmod_bnm1.c create mode 100644 gmp-6.3.0/mpn/generic/neg.c create mode 100644 
gmp-6.3.0/mpn/generic/nussbaumer_mul.c create mode 100644 gmp-6.3.0/mpn/generic/perfpow.c create mode 100644 gmp-6.3.0/mpn/generic/perfsqr.c create mode 100644 gmp-6.3.0/mpn/generic/popham.c create mode 100644 gmp-6.3.0/mpn/generic/pow_1.c create mode 100644 gmp-6.3.0/mpn/generic/powlo.c create mode 100644 gmp-6.3.0/mpn/generic/powm.c create mode 100644 gmp-6.3.0/mpn/generic/pre_divrem_1.c create mode 100644 gmp-6.3.0/mpn/generic/pre_mod_1.c create mode 100644 gmp-6.3.0/mpn/generic/random.c create mode 100644 gmp-6.3.0/mpn/generic/random2.c create mode 100644 gmp-6.3.0/mpn/generic/redc_1.c create mode 100644 gmp-6.3.0/mpn/generic/redc_2.c create mode 100644 gmp-6.3.0/mpn/generic/redc_n.c create mode 100644 gmp-6.3.0/mpn/generic/remove.c create mode 100644 gmp-6.3.0/mpn/generic/rootrem.c create mode 100644 gmp-6.3.0/mpn/generic/rshift.c create mode 100644 gmp-6.3.0/mpn/generic/sbpi1_bdiv_q.c create mode 100644 gmp-6.3.0/mpn/generic/sbpi1_bdiv_qr.c create mode 100644 gmp-6.3.0/mpn/generic/sbpi1_bdiv_r.c create mode 100644 gmp-6.3.0/mpn/generic/sbpi1_div_q.c create mode 100644 gmp-6.3.0/mpn/generic/sbpi1_div_qr.c create mode 100644 gmp-6.3.0/mpn/generic/sbpi1_divappr_q.c create mode 100644 gmp-6.3.0/mpn/generic/scan0.c create mode 100644 gmp-6.3.0/mpn/generic/scan1.c create mode 100644 gmp-6.3.0/mpn/generic/sec_aors_1.c create mode 100644 gmp-6.3.0/mpn/generic/sec_div.c create mode 100644 gmp-6.3.0/mpn/generic/sec_invert.c create mode 100644 gmp-6.3.0/mpn/generic/sec_mul.c create mode 100644 gmp-6.3.0/mpn/generic/sec_pi1_div.c create mode 100644 gmp-6.3.0/mpn/generic/sec_powm.c create mode 100644 gmp-6.3.0/mpn/generic/sec_sqr.c create mode 100644 gmp-6.3.0/mpn/generic/sec_tabselect.c create mode 100644 gmp-6.3.0/mpn/generic/set_str.c create mode 100644 gmp-6.3.0/mpn/generic/sizeinbase.c create mode 100644 gmp-6.3.0/mpn/generic/sqr.c create mode 100644 gmp-6.3.0/mpn/generic/sqr_basecase.c create mode 100644 gmp-6.3.0/mpn/generic/sqrlo.c create mode 100644 gmp-6.3.0/mpn/generic/sqrlo_basecase.c create mode 100644 gmp-6.3.0/mpn/generic/sqrmod_bnm1.c create mode 100644 gmp-6.3.0/mpn/generic/sqrtrem.c create mode 100644 gmp-6.3.0/mpn/generic/strongfibo.c create mode 100644 gmp-6.3.0/mpn/generic/sub.c create mode 100644 gmp-6.3.0/mpn/generic/sub_1.c create mode 100644 gmp-6.3.0/mpn/generic/sub_err1_n.c create mode 100644 gmp-6.3.0/mpn/generic/sub_err2_n.c create mode 100644 gmp-6.3.0/mpn/generic/sub_err3_n.c create mode 100644 gmp-6.3.0/mpn/generic/sub_n.c create mode 100644 gmp-6.3.0/mpn/generic/submul_1.c create mode 100644 gmp-6.3.0/mpn/generic/tdiv_qr.c create mode 100644 gmp-6.3.0/mpn/generic/toom22_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom2_sqr.c create mode 100644 gmp-6.3.0/mpn/generic/toom32_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom33_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom3_sqr.c create mode 100644 gmp-6.3.0/mpn/generic/toom42_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom42_mulmid.c create mode 100644 gmp-6.3.0/mpn/generic/toom43_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom44_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom4_sqr.c create mode 100644 gmp-6.3.0/mpn/generic/toom52_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom53_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom54_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom62_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom63_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom6_sqr.c create mode 100644 gmp-6.3.0/mpn/generic/toom6h_mul.c create mode 100644 
gmp-6.3.0/mpn/generic/toom8_sqr.c create mode 100644 gmp-6.3.0/mpn/generic/toom8h_mul.c create mode 100644 gmp-6.3.0/mpn/generic/toom_couple_handling.c create mode 100644 gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm1.c create mode 100644 gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm2.c create mode 100644 gmp-6.3.0/mpn/generic/toom_eval_pm1.c create mode 100644 gmp-6.3.0/mpn/generic/toom_eval_pm2.c create mode 100644 gmp-6.3.0/mpn/generic/toom_eval_pm2exp.c create mode 100644 gmp-6.3.0/mpn/generic/toom_eval_pm2rexp.c create mode 100644 gmp-6.3.0/mpn/generic/toom_interpolate_12pts.c create mode 100644 gmp-6.3.0/mpn/generic/toom_interpolate_16pts.c create mode 100644 gmp-6.3.0/mpn/generic/toom_interpolate_5pts.c create mode 100644 gmp-6.3.0/mpn/generic/toom_interpolate_6pts.c create mode 100644 gmp-6.3.0/mpn/generic/toom_interpolate_7pts.c create mode 100644 gmp-6.3.0/mpn/generic/toom_interpolate_8pts.c create mode 100644 gmp-6.3.0/mpn/generic/trialdiv.c create mode 100644 gmp-6.3.0/mpn/generic/udiv_w_sdiv.c create mode 100644 gmp-6.3.0/mpn/generic/zero.c create mode 100644 gmp-6.3.0/mpn/generic/zero_p.c (limited to 'gmp-6.3.0/mpn/generic') diff --git a/gmp-6.3.0/mpn/generic/add.c b/gmp-6.3.0/mpn/generic/add.c new file mode 100644 index 0000000..4a6e3ba --- /dev/null +++ b/gmp-6.3.0/mpn/generic/add.c @@ -0,0 +1,33 @@ +/* mpn_add - add mpn to mpn. + +Copyright 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define __GMP_FORCE_mpn_add 1 + +#include "gmp-impl.h" diff --git a/gmp-6.3.0/mpn/generic/add_1.c b/gmp-6.3.0/mpn/generic/add_1.c new file mode 100644 index 0000000..1745aed --- /dev/null +++ b/gmp-6.3.0/mpn/generic/add_1.c @@ -0,0 +1,33 @@ +/* mpn_add_1 - add limb to mpn. + +Copyright 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
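(Illustrative sketch, not part of the patch.) add.c above contains no function body: defining __GMP_FORCE_mpn_add before including gmp-impl.h causes the otherwise-inline mpn_add from gmp.h to be emitted as a real, out-of-line function (add_1.c below plays the same trick for mpn_add_1). A minimal usage sketch of the documented mpn_add semantics, {s1,s1n} + {s2,s2n} with s1n >= s2n, assuming an installed GMP:

    #include <stdio.h>
    #include <gmp.h>

    int
    main (void)
    {
      mp_limb_t s1[2] = { ~(mp_limb_t) 0, ~(mp_limb_t) 0 };  /* largest 2-limb value */
      mp_limb_t s2[1] = { 1 };
      mp_limb_t r[2];
      mp_limb_t cy;

      cy = mpn_add (r, s1, 2, s2, 1);   /* {r,2} = s1 + s2, cy is the carry out */

      /* Adding 1 to the all-ones value wraps both limbs to 0 and sets cy = 1. */
      printf ("cy=%lu high=%lu low=%lu\n",
              (unsigned long) cy, (unsigned long) r[1], (unsigned long) r[0]);
      return 0;
    }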
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define __GMP_FORCE_mpn_add_1 1 + +#include "gmp-impl.h" diff --git a/gmp-6.3.0/mpn/generic/add_err1_n.c b/gmp-6.3.0/mpn/generic/add_err1_n.c new file mode 100644 index 0000000..b247f19 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/add_err1_n.c @@ -0,0 +1,100 @@ +/* mpn_add_err1_n -- add_n with one error term + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* + Computes: + + (1) {rp,n} := {up,n} + {vp,n} (just like mpn_add_n) with incoming carry cy, + return value is carry out. + + (2) Let c[i+1] = carry from i-th limb addition (c[0] = cy). + Computes c[1]*yp[n-1] + ... + c[n]*yp[0], stores two-limb result at ep. + + Requires n >= 1. + + None of the outputs may overlap each other or any of the inputs, except + that {rp,n} may be equal to {up,n} or {vp,n}. +*/ +mp_limb_t +mpn_add_err1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, + mp_ptr ep, mp_srcptr yp, + mp_size_t n, mp_limb_t cy) +{ + mp_limb_t el, eh, ul, vl, yl, zl, rl, sl, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, up, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, vp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, yp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, rp, n)); + + yp += n - 1; + el = eh = 0; + + do + { + yl = *yp--; + ul = *up++; + vl = *vp++; + + /* ordinary add_n */ + ADDC_LIMB (cy1, sl, ul, vl); + ADDC_LIMB (cy2, rl, sl, cy); + cy = cy1 | cy2; + *rp++ = rl; + + /* update (eh:el) */ + zl = (-cy) & yl; + el += zl; + eh += el < zl; + } + while (--n); + +#if GMP_NAIL_BITS != 0 + eh = (eh << GMP_NAIL_BITS) + (el >> GMP_NUMB_BITS); + el &= GMP_NUMB_MASK; +#endif + + ep[0] = el; + ep[1] = eh; + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/add_err2_n.c b/gmp-6.3.0/mpn/generic/add_err2_n.c new file mode 100644 index 0000000..d584d6d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/add_err2_n.c @@ -0,0 +1,116 @@ +/* mpn_add_err2_n -- add_n with two error terms + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. 
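A toy, self-contained model of mpn_add_err1_n above for 64-bit limbs (plain uint64_t instead of the GMP types, illustration only): the carry out of each limb addition selects y[n-1-i] through the branch-free mask -cy, and the selected values are accumulated into a two-limb sum, exactly as the documented c[1]*yp[n-1] + ... + c[n]*yp[0].

    #include <stdint.h>
    #include <stddef.h>

    /* r = u + v with incoming carry cy; ep[0..1] receives
       c[1]*y[n-1] + ... + c[n]*y[0], where c[i+1] is the carry out of
       the i-th limb addition.  Returns the final carry. */
    static uint64_t
    toy_add_err1 (uint64_t *r, const uint64_t *u, const uint64_t *v,
                  uint64_t ep[2], const uint64_t *y, size_t n, uint64_t cy)
    {
      uint64_t el = 0, eh = 0;
      for (size_t i = 0; i < n; i++)
        {
          uint64_t s = u[i] + v[i];
          uint64_t c1 = s < u[i];
          uint64_t rl = s + cy;
          uint64_t c2 = rl < s;
          cy = c1 | c2;
          r[i] = rl;

          uint64_t z = (-cy) & y[n - 1 - i];  /* cy * y, branch-free */
          el += z;
          eh += el < z;                       /* two-limb accumulate */
        }
      ep[0] = el;
      ep[1] = eh;
      return cy;
    }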
IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* + Computes: + + (1) {rp,n} := {up,n} + {vp,n} (just like mpn_add_n) with incoming carry cy, + return value is carry out. + + (2) Let c[i+1] = carry from i-th limb addition (c[0] = cy). + Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0], + c[1]*yp2[n-1] + ... + c[n]*yp2[0], + stores two-limb results at {ep,2} and {ep+2,2} respectively. + + Requires n >= 1. + + None of the outputs may overlap each other or any of the inputs, except + that {rp,n} may be equal to {up,n} or {vp,n}. +*/ +mp_limb_t +mpn_add_err2_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, + mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2, + mp_size_t n, mp_limb_t cy) +{ + mp_limb_t el1, eh1, el2, eh2, ul, vl, yl1, yl2, zl1, zl2, rl, sl, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, up, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, vp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, yp1, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, yp2, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, rp, n)); + + yp1 += n - 1; + yp2 += n - 1; + el1 = eh1 = 0; + el2 = eh2 = 0; + + do + { + yl1 = *yp1--; + yl2 = *yp2--; + ul = *up++; + vl = *vp++; + + /* ordinary add_n */ + ADDC_LIMB (cy1, sl, ul, vl); + ADDC_LIMB (cy2, rl, sl, cy); + cy = cy1 | cy2; + *rp++ = rl; + + /* update (eh1:el1) */ + zl1 = (-cy) & yl1; + el1 += zl1; + eh1 += el1 < zl1; + + /* update (eh2:el2) */ + zl2 = (-cy) & yl2; + el2 += zl2; + eh2 += el2 < zl2; + } + while (--n); + +#if GMP_NAIL_BITS != 0 + eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS); + el1 &= GMP_NUMB_MASK; + eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS); + el2 &= GMP_NUMB_MASK; +#endif + + ep[0] = el1; + ep[1] = eh1; + ep[2] = el2; + ep[3] = eh2; + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/add_err3_n.c b/gmp-6.3.0/mpn/generic/add_err3_n.c new file mode 100644 index 0000000..a6ed4dc --- /dev/null +++ b/gmp-6.3.0/mpn/generic/add_err3_n.c @@ -0,0 +1,131 @@ +/* mpn_add_err3_n -- add_n with three error terms + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 
+ +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* + Computes: + + (1) {rp,n} := {up,n} + {vp,n} (just like mpn_add_n) with incoming carry cy, + return value is carry out. + + (2) Let c[i+1] = carry from i-th limb addition (c[0] = cy). + Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0], + c[1]*yp2[n-1] + ... + c[n]*yp2[0], + c[1]*yp3[n-1] + ... + c[n]*yp3[0], + stores two-limb results at {ep,2}, {ep+2,2} and {ep+4,2} respectively. + + Requires n >= 1. + + None of the outputs may overlap each other or any of the inputs, except + that {rp,n} may be equal to {up,n} or {vp,n}. +*/ +mp_limb_t +mpn_add_err3_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, + mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2, mp_srcptr yp3, + mp_size_t n, mp_limb_t cy) +{ + mp_limb_t el1, eh1, el2, eh2, el3, eh3, ul, vl, yl1, yl2, yl3, zl1, zl2, zl3, rl, sl, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp3, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, up, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, vp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, yp1, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, yp2, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, yp3, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, rp, n)); + + yp1 += n - 1; + yp2 += n - 1; + yp3 += n - 1; + el1 = eh1 = 0; + el2 = eh2 = 0; + el3 = eh3 = 0; + + do + { + yl1 = *yp1--; + yl2 = *yp2--; + yl3 = *yp3--; + ul = *up++; + vl = *vp++; + + /* ordinary add_n */ + ADDC_LIMB (cy1, sl, ul, vl); + ADDC_LIMB (cy2, rl, sl, cy); + cy = cy1 | cy2; + *rp++ = rl; + + /* update (eh1:el1) */ + zl1 = (-cy) & yl1; + el1 += zl1; + eh1 += el1 < zl1; + + /* update (eh2:el2) */ + zl2 = (-cy) & yl2; + el2 += zl2; + eh2 += el2 < zl2; + + /* update (eh3:el3) */ + zl3 = (-cy) & yl3; + el3 += zl3; + eh3 += el3 < zl3; + } + while (--n); + +#if GMP_NAIL_BITS != 0 + eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS); + el1 &= GMP_NUMB_MASK; + eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS); + el2 &= GMP_NUMB_MASK; + eh3 = (eh3 << GMP_NAIL_BITS) + (el3 >> GMP_NUMB_BITS); + el3 &= GMP_NUMB_MASK; +#endif + + ep[0] = el1; + ep[1] = eh1; + ep[2] = el2; + ep[3] = eh2; + ep[4] = el3; + ep[5] = eh3; + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/add_n.c b/gmp-6.3.0/mpn/generic/add_n.c new file mode 100644 index 0000000..f62ac87 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/add_n.c @@ -0,0 +1,89 @@ +/* mpn_add_n -- Add equal length limb vectors. 
+ +Copyright 1992-1994, 1996, 2000, 2002, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +#if GMP_NAIL_BITS == 0 + +mp_limb_t +mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t ul, vl, sl, rl, cy, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_INCR_P (rp, up, n)); + ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n)); + + cy = 0; + do + { + ul = *up++; + vl = *vp++; + sl = ul + vl; + cy1 = sl < ul; + rl = sl + cy; + cy2 = rl < sl; + cy = cy1 | cy2; + *rp++ = rl; + } + while (--n != 0); + + return cy; +} + +#endif + +#if GMP_NAIL_BITS >= 1 + +mp_limb_t +mpn_add_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t ul, vl, rl, cy; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_INCR_P (rp, up, n)); + ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n)); + + cy = 0; + do + { + ul = *up++; + vl = *vp++; + rl = ul + vl + cy; + cy = rl >> GMP_NUMB_BITS; + *rp++ = rl & GMP_NUMB_MASK; + } + while (--n != 0); + + return cy; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/add_n_sub_n.c b/gmp-6.3.0/mpn/generic/add_n_sub_n.c new file mode 100644 index 0000000..1e72b5d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/add_n_sub_n.c @@ -0,0 +1,172 @@ +/* mpn_add_n_sub_n -- Add and Subtract two limb vectors of equal, non-zero length. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 1999-2001, 2006 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
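A minimal usage sketch of the public mpn_add_n defined above (assuming an installed GMP with the default nail-free build): both operands have the same length, and the return value is the carry out of the most significant limb.

    #include <stdio.h>
    #include <gmp.h>

    int
    main (void)
    {
      mp_limb_t a[2] = { 5, 7 };               /* 7*B + 5, B = 2^GMP_NUMB_BITS */
      mp_limb_t b[2] = { ~(mp_limb_t) 0, 1 };  /* 1*B + (B-1) */
      mp_limb_t r[2];
      mp_limb_t cy = mpn_add_n (r, a, b, 2);

      /* Low limb wraps to 4 with a carry into the high limb: r = {4, 9}, cy = 0. */
      printf ("cy=%lu high=%lu low=%lu\n",
              (unsigned long) cy, (unsigned long) r[1], (unsigned long) r[0]);
      return 0;
    }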
*/ + +#include "gmp-impl.h" + +#ifndef L1_CACHE_SIZE +#define L1_CACHE_SIZE 8192 /* only 68040 has less than this */ +#endif + +#define PART_SIZE (L1_CACHE_SIZE / GMP_LIMB_BYTES / 6) + + +/* mpn_add_n_sub_n. + r1[] = s1[] + s2[] + r2[] = s1[] - s2[] + All operands have n limbs. + In-place operations allowed. */ +mp_limb_t +mpn_add_n_sub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n) +{ + mp_limb_t acyn, acyo; /* carry for add */ + mp_limb_t scyn, scyo; /* carry for subtract */ + mp_size_t off; /* offset in operands */ + mp_size_t this_n; /* size of current chunk */ + + /* We alternatingly add and subtract in chunks that fit into the (L1) + cache. Since the chunks are several hundred limbs, the function call + overhead is insignificant, but we get much better locality. */ + + /* We have three variant of the inner loop, the proper loop is chosen + depending on whether r1 or r2 are the same operand as s1 or s2. */ + + if (r1p != s1p && r1p != s2p) + { + /* r1 is not identical to either input operand. We can therefore write + to r1 directly, without using temporary storage. */ + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_add_nc + acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo); +#endif +#if HAVE_NATIVE_mpn_sub_nc + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif + } + } + else if (r2p != s1p && r2p != s2p) + { + /* r2 is not identical to either input operand. We can therefore write + to r2 directly, without using temporary storage. */ + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_sub_nc + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif +#if HAVE_NATIVE_mpn_add_nc + acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo); +#endif + } + } + else + { + /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2==s2 or vice versa) + Need temporary storage. 
*/ + mp_limb_t tp[PART_SIZE]; + acyo = 0; + scyo = 0; + for (off = 0; off < n; off += PART_SIZE) + { + this_n = MIN (n - off, PART_SIZE); +#if HAVE_NATIVE_mpn_add_nc + acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo); +#else + acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n); + acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo); +#endif +#if HAVE_NATIVE_mpn_sub_nc + scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); +#else + scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); + scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); +#endif + MPN_COPY (r1p + off, tp, this_n); + } + } + + return 2 * acyo + scyo; +} + +#ifdef MAIN +#include +#include +#include "timing.h" + +long cputime (); + +int +main (int argc, char **argv) +{ + mp_ptr r1p, r2p, s1p, s2p; + double t; + mp_size_t n; + + n = strtol (argv[1], 0, 0); + + r1p = malloc (n * GMP_LIMB_BYTES); + r2p = malloc (n * GMP_LIMB_BYTES); + s1p = malloc (n * GMP_LIMB_BYTES); + s2p = malloc (n * GMP_LIMB_BYTES); + TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n))); + printf (" separate add and sub: %.3f\n", t); + TIME (t,mpn_add_n_sub_n(r1p,r2p,s1p,s2p,n)); + printf ("combined addsub separate variables: %.3f\n", t); + TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n)); + printf (" combined addsub r1 overlap: %.3f\n", t); + TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n)); + printf (" combined addsub r2 overlap: %.3f\n", t); + TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,r2p,n)); + printf (" combined addsub in-place: %.3f\n", t); + + return 0; +} +#endif diff --git a/gmp-6.3.0/mpn/generic/addmul_1.c b/gmp-6.3.0/mpn/generic/addmul_1.c new file mode 100644 index 0000000..6140e8e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/addmul_1.c @@ -0,0 +1,145 @@ +/* mpn_addmul_1 -- multiply the N long limb vector pointed to by UP by VL, + add the N least significant limbs of the product to the limb vector + pointed to by RP. Return the most significant limb of the product, + adjusted for carry-out from the addition. + +Copyright 1992-1994, 1996, 2000, 2002, 2004, 2016 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +#if GMP_NAIL_BITS == 0 + +mp_limb_t +mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0) +{ + mp_limb_t u0, crec, c, p1, p0, r0; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + + crec = 0; + do + { + u0 = *up++; + umul_ppmm (p1, p0, u0, v0); + + r0 = *rp; + + p0 = r0 + p0; + c = r0 > p0; + + p1 = p1 + c; + + r0 = p0 + crec; /* cycle 0, 3, ... */ + c = p0 > r0; /* cycle 1, 4, ... 
*/ + + crec = p1 + c; /* cycle 2, 5, ... */ + + *rp++ = r0; + } + while (--n != 0); + + return crec; +} + +#endif + +#if GMP_NAIL_BITS == 1 + +mp_limb_t +mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0) +{ + mp_limb_t shifted_v0, u0, r0, p0, p1, prev_p1, crec, xl, c1, c2, c3; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT_MPN (rp, n); + ASSERT_MPN (up, n); + ASSERT_LIMB (v0); + + shifted_v0 = v0 << GMP_NAIL_BITS; + crec = 0; + prev_p1 = 0; + do + { + u0 = *up++; + r0 = *rp; + umul_ppmm (p1, p0, u0, shifted_v0); + p0 >>= GMP_NAIL_BITS; + ADDC_LIMB (c1, xl, prev_p1, p0); + ADDC_LIMB (c2, xl, xl, r0); + ADDC_LIMB (c3, xl, xl, crec); + crec = c1 + c2 + c3; + *rp++ = xl; + prev_p1 = p1; + } + while (--n != 0); + + return prev_p1 + crec; +} + +#endif + +#if GMP_NAIL_BITS >= 2 + +mp_limb_t +mpn_addmul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0) +{ + mp_limb_t shifted_v0, u0, r0, p0, p1, prev_p1, xw, crec, xl; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT_MPN (rp, n); + ASSERT_MPN (up, n); + ASSERT_LIMB (v0); + + shifted_v0 = v0 << GMP_NAIL_BITS; + crec = 0; + prev_p1 = 0; + do + { + u0 = *up++; + r0 = *rp; + umul_ppmm (p1, p0, u0, shifted_v0); + p0 >>= GMP_NAIL_BITS; + xw = prev_p1 + p0 + r0 + crec; + crec = xw >> GMP_NUMB_BITS; + xl = xw & GMP_NUMB_MASK; + *rp++ = xl; + prev_p1 = p1; + } + while (--n != 0); + + return prev_p1 + crec; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/bdiv_dbm1c.c b/gmp-6.3.0/mpn/generic/bdiv_dbm1c.c new file mode 100644 index 0000000..543bb6e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/bdiv_dbm1c.c @@ -0,0 +1,58 @@ +/* mpn_bdiv_dbm1c -- divide an mpn number by a divisor of B-1, where B is the + limb base. The dbm1c moniker means "Divisor of B Minus 1 with Carry". + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2008, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
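mpn_addmul_1 above is the inner step of schoolbook multiplication: it multiplies the n-limb operand {up,n} by a single limb, adds the product into {rp,n}, and returns the limb that spills out the top. A minimal usage sketch of the public entry point (assuming an installed GMP, default nail-free build):

    #include <stdio.h>
    #include <gmp.h>

    int
    main (void)
    {
      /* Accumulate up * 10 into rp, as one inner step of a schoolbook multiply. */
      mp_limb_t up[2] = { 3, 2 };          /* 2*B + 3 */
      mp_limb_t rp[2] = { 100, 0 };
      mp_limb_t hi = mpn_addmul_1 (rp, up, 2, 10);

      /* rp now holds (2*B + 3)*10 + 100 = 20*B + 130, and hi = 0 since the
         result still fits in two limbs. */
      printf ("hi=%lu high=%lu low=%lu\n",
              (unsigned long) hi, (unsigned long) rp[1], (unsigned long) rp[0]);
      return 0;
    }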
*/ + +#include "gmp-impl.h" +#include "longlong.h" + + +mp_limb_t +mpn_bdiv_dbm1c (mp_ptr qp, mp_srcptr ap, mp_size_t n, mp_limb_t bd, mp_limb_t h) +{ + mp_limb_t a, p0, p1, cy; + mp_size_t i; + + for (i = 0; i < n; i++) + { + a = ap[i]; + umul_ppmm (p1, p0, a, bd << GMP_NAIL_BITS); + p0 >>= GMP_NAIL_BITS; + cy = h < p0; + h = (h - p0) & GMP_NUMB_MASK; + qp[i] = h; + h = h - p1 - cy; + } + + return h; +} diff --git a/gmp-6.3.0/mpn/generic/bdiv_q.c b/gmp-6.3.0/mpn/generic/bdiv_q.c new file mode 100644 index 0000000..52aa473 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/bdiv_q.c @@ -0,0 +1,76 @@ +/* mpn_bdiv_q -- Hensel division with precomputed inverse, returning quotient. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +/* Computes Q = N / D mod B^n. */ + +void +mpn_bdiv_q (mp_ptr qp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr tp) +{ + mp_limb_t di; + + if (BELOW_THRESHOLD (dn, DC_BDIV_Q_THRESHOLD)) + { + MPN_COPY (tp, np, nn); + binvert_limb (di, dp[0]); di = -di; + mpn_sbpi1_bdiv_q (qp, tp, nn, dp, dn, di); + } + else if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD)) + { + MPN_COPY (tp, np, nn); + binvert_limb (di, dp[0]); di = -di; + mpn_dcpi1_bdiv_q (qp, tp, nn, dp, dn, di); + } + else + { + mpn_mu_bdiv_q (qp, np, nn, dp, dn, tp); + } + return; +} + +mp_size_t +mpn_bdiv_q_itch (mp_size_t nn, mp_size_t dn) +{ + if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD)) + return nn; + else + return mpn_mu_bdiv_q_itch (nn, dn); +} diff --git a/gmp-6.3.0/mpn/generic/bdiv_q_1.c b/gmp-6.3.0/mpn/generic/bdiv_q_1.c new file mode 100644 index 0000000..6beb9a0 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/bdiv_q_1.c @@ -0,0 +1,121 @@ +/* mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by 1-limb + divisor, returning quotient only. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2000-2003, 2005, 2009, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
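Both mpn_bdiv_q above and the mpn_bdiv_q_1 code that follows start by computing the inverse of the odd low divisor limb modulo B with binvert_limb. A self-contained sketch of that computation for 64-bit limbs (plain uint64_t, an illustration rather than the GMP macro itself): an odd d is its own inverse modulo 8, and each step x <- x*(2 - d*x) doubles the number of correct low bits.

    #include <stdint.h>

    /* Inverse of an odd d modulo 2^64: d * binv64 (d) == 1 (mod 2^64). */
    static uint64_t
    binv64 (uint64_t d)
    {
      uint64_t x = d;                 /* correct to 3 bits: d*d == 1 (mod 8) */
      x *= 2 - d * x;                 /* 6 bits */
      x *= 2 - d * x;                 /* 12 bits */
      x *= 2 - d * x;                 /* 24 bits */
      x *= 2 - d * x;                 /* 48 bits */
      x *= 2 - d * x;                 /* 96 >= 64 bits */
      return x;
    }

With q = n * binv64 (d) (mod 2^64), q is the exact single-limb quotient whenever d divides n, which is precisely the precondition of the Hensel (bdiv) routines.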
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_pi1_bdiv_q_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t d, + mp_limb_t di, int shift) +{ + mp_size_t i; + mp_limb_t c, h, l, u, u_next, dummy; + + ASSERT (n >= 1); + ASSERT (d != 0); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT_MPN (up, n); + ASSERT_LIMB (d); + + d <<= GMP_NAIL_BITS; + + if (shift != 0) + { + c = 0; + + u = up[0]; + rp--; + for (i = 1; i < n; i++) + { + u_next = up[i]; + u = ((u >> shift) | (u_next << (GMP_NUMB_BITS-shift))) & GMP_NUMB_MASK; + + SUBC_LIMB (c, l, u, c); + + l = (l * di) & GMP_NUMB_MASK; + rp[i] = l; + + umul_ppmm (h, dummy, l, d); + c += h; + u = u_next; + } + + u = u >> shift; + SUBC_LIMB (c, l, u, c); + + l = (l * di) & GMP_NUMB_MASK; + rp[n] = l; + } + else + { + u = up[0]; + l = (u * di) & GMP_NUMB_MASK; + rp[0] = l; + c = 0; + + for (i = 1; i < n; i++) + { + umul_ppmm (h, dummy, l, d); + c += h; + + u = up[i]; + SUBC_LIMB (c, l, u, c); + + l = (l * di) & GMP_NUMB_MASK; + rp[i] = l; + } + } + + return c; +} + +mp_limb_t +mpn_bdiv_q_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t d) +{ + mp_limb_t di; + int shift; + + ASSERT (n >= 1); + ASSERT (d != 0); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT_MPN (up, n); + ASSERT_LIMB (d); + + count_trailing_zeros (shift, d); + d >>= shift; + + binvert_limb (di, d); + return mpn_pi1_bdiv_q_1 (rp, up, n, d, di, shift); +} diff --git a/gmp-6.3.0/mpn/generic/bdiv_qr.c b/gmp-6.3.0/mpn/generic/bdiv_qr.c new file mode 100644 index 0000000..a4f0f39 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/bdiv_qr.c @@ -0,0 +1,84 @@ +/* mpn_bdiv_qr -- Hensel division with precomputed inverse, returning quotient + and remainder. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
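A toy model of the shift-free path of mpn_pi1_bdiv_q_1 above for 64-bit limbs, using unsigned __int128 in place of umul_ppmm (a compiler extension, assumed available; illustration only): when d is odd and divides the dividend exactly, each quotient limb is the modular product of the adjusted dividend limb with the precomputed inverse, and the value carried to the next limb is the high half of q_i * d plus the borrow from the limb subtraction.

    #include <stdint.h>
    #include <stddef.h>

    /* Exact division q = u / d for odd d with d | u, 64-bit limbs, no shift.
       dinv must satisfy d * dinv == 1 (mod 2^64), e.g. from binv64 above. */
    static void
    exact_div_odd (uint64_t *q, const uint64_t *u, size_t n,
                   uint64_t d, uint64_t dinv)
    {
      uint64_t c = 0;                  /* borrow plus high product, as above */
      for (size_t i = 0; i < n; i++)
        {
          uint64_t b = u[i] < c;             /* borrow from the subtraction */
          uint64_t l = (u[i] - c) * dinv;    /* next quotient limb, mod 2^64 */
          q[i] = l;
          c = (uint64_t) (((unsigned __int128) l * d) >> 64) + b;
        }
    }

Because the division is exact, only the low half of each product l * d has to agree with the dividend; the high half feeds the running carry, which is why no trial quotients or corrections are needed.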
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +/* Computes Q = N / D mod B^n, + R = N - QD. */ + +mp_limb_t +mpn_bdiv_qr (mp_ptr qp, mp_ptr rp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr tp) +{ + mp_limb_t di; + mp_limb_t rh; + + ASSERT (nn > dn); + if (BELOW_THRESHOLD (dn, DC_BDIV_QR_THRESHOLD) || + BELOW_THRESHOLD (nn - dn, DC_BDIV_QR_THRESHOLD)) + { + MPN_COPY (tp, np, nn); + binvert_limb (di, dp[0]); di = -di; + rh = mpn_sbpi1_bdiv_qr (qp, tp, nn, dp, dn, di); + MPN_COPY (rp, tp + nn - dn, dn); + } + else if (BELOW_THRESHOLD (dn, MU_BDIV_QR_THRESHOLD)) + { + MPN_COPY (tp, np, nn); + binvert_limb (di, dp[0]); di = -di; + rh = mpn_dcpi1_bdiv_qr (qp, tp, nn, dp, dn, di); + MPN_COPY (rp, tp + nn - dn, dn); + } + else + { + rh = mpn_mu_bdiv_qr (qp, rp, np, nn, dp, dn, tp); + } + + return rh; +} + +mp_size_t +mpn_bdiv_qr_itch (mp_size_t nn, mp_size_t dn) +{ + if (BELOW_THRESHOLD (dn, MU_BDIV_QR_THRESHOLD)) + return nn; + else + return mpn_mu_bdiv_qr_itch (nn, dn); +} diff --git a/gmp-6.3.0/mpn/generic/binvert.c b/gmp-6.3.0/mpn/generic/binvert.c new file mode 100644 index 0000000..a170e66 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/binvert.c @@ -0,0 +1,106 @@ +/* Compute {up,n}^(-1) mod B^n. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright (C) 2004-2007, 2009, 2012, 2017, 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +/* + r[k+1] = r[k] - r[k] * (u*r[k] - 1) + r[k+1] = r[k] + r[k] - r[k]*(u*r[k]) +*/ + +#if TUNE_PROGRAM_BUILD +#define NPOWS \ + ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t))) +#else +#define NPOWS \ + ((sizeof(mp_size_t) > 6 ? 
48 : 8*sizeof(mp_size_t)) - LOG2C (BINV_NEWTON_THRESHOLD)) +#endif + +mp_size_t +mpn_binvert_itch (mp_size_t n) +{ + mp_size_t itch_local = mpn_mulmod_bnm1_next_size (n); + mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, n, (n + 1) >> 1); + return itch_local + itch_out; +} + +void +mpn_binvert (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_ptr scratch) +{ + mp_ptr xp; + mp_size_t rn, newrn; + mp_size_t sizes[NPOWS], *sizp; + mp_limb_t di; + + /* Compute the computation precisions from highest to lowest, leaving the + base case size in 'rn'. */ + sizp = sizes; + for (rn = n; ABOVE_THRESHOLD (rn, BINV_NEWTON_THRESHOLD); rn = (rn + 1) >> 1) + *sizp++ = rn; + + xp = scratch; + + /* Compute a base value of rn limbs. */ + MPN_ZERO (xp, rn); + xp[0] = 1; + binvert_limb (di, up[0]); + if (BELOW_THRESHOLD (rn, DC_BDIV_Q_THRESHOLD)) + mpn_sbpi1_bdiv_q (rp, xp, rn, up, rn, -di); + else + mpn_dcpi1_bdiv_q (rp, xp, rn, up, rn, -di); + + mpn_neg (rp, rp, rn); + + /* Use Newton iterations to get the desired precision. */ + for (; rn < n; rn = newrn) + { + mp_size_t m; + newrn = *--sizp; + + /* X <- UR. */ + m = mpn_mulmod_bnm1_next_size (newrn); + mpn_mulmod_bnm1 (xp, m, up, newrn, rp, rn, xp + m); + /* Only the values in the range xp + rn .. xp + newrn - 1 are + used by the _mullo_n below. + Since m >= newrn, we do not need the following. */ + /* mpn_sub_1 (xp + m, xp, rn - (m - newrn), 1); */ + + /* R = R(X/B^rn) */ + mpn_mullo_n (rp + rn, rp, xp + rn, newrn - rn); + mpn_neg (rp + rn, rp + rn, newrn - rn); + } +} diff --git a/gmp-6.3.0/mpn/generic/broot.c b/gmp-6.3.0/mpn/generic/broot.c new file mode 100644 index 0000000..02fe75a --- /dev/null +++ b/gmp-6.3.0/mpn/generic/broot.c @@ -0,0 +1,195 @@ +/* mpn_broot -- Compute hensel sqrt + + Contributed to the GNU project by Niels Möller + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* Computes a^e (mod B). Uses right-to-left binary algorithm, since + typical use will have e small. */ +static mp_limb_t +powlimb (mp_limb_t a, mp_limb_t e) +{ + mp_limb_t r = 1; + mp_limb_t s = a; + + for (r = 1, s = a; e > 0; e >>= 1, s *= s) + if (e & 1) + r *= s; + + return r; +} + +/* Computes a^{1/k - 1} (mod B^n). Both a and k must be odd. 
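The comment at the top of binvert.c above gives the Newton step r' = 2r - r*(u*r), equivalently r' = r*(2 - u*r), which doubles the number of correct low limbs per pass. A fixed-width sketch of a single such doubling, lifting a 64-bit inverse to 128 bits with unsigned __int128 (a compiler extension, assumed available):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Given x64 with u * x64 == 1 (mod 2^64), return x with
       u * x == 1 (mod 2^128): one Newton doubling step. */
    static u128
    lift_inverse_128 (u128 u, uint64_t x64)
    {
      u128 x = x64;
      return x * (2 - u * x);
    }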
+ + Iterates + + r' <-- r - r * (a^{k-1} r^k - 1) / n + + If + + a^{k-1} r^k = 1 (mod 2^m), + + then + + a^{k-1} r'^k = 1 (mod 2^{2m}), + + Compute the update term as + + r' = r - (a^{k-1} r^{k+1} - r) / k + + where we still have cancellation of low limbs. + + */ +void +mpn_broot_invm1 (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k) +{ + mp_size_t sizes[GMP_LIMB_BITS * 2]; + mp_ptr akm1, tp, rnp, ep; + mp_limb_t a0, r0, km1, kp1h, kinv; + mp_size_t rn; + unsigned i; + + TMP_DECL; + + ASSERT (n > 0); + ASSERT (ap[0] & 1); + ASSERT (k & 1); + ASSERT (k >= 3); + + TMP_MARK; + + akm1 = TMP_ALLOC_LIMBS (4*n); + tp = akm1 + n; + + km1 = k-1; + /* FIXME: Could arrange the iteration so we don't need to compute + this up front, computing a^{k-1} * r^k as (a r)^{k-1} * r. Note + that we can use wraparound also for a*r, since the low half is + unchanged from the previous iteration. Or possibly mulmid. Also, + a r = a^{1/k}, so we get that value too, for free? */ + mpn_powlo (akm1, ap, &km1, 1, n, tp); /* 3 n scratch space */ + + a0 = ap[0]; + binvert_limb (kinv, k); + + /* 4 bits: a^{1/k - 1} (mod 16): + + a % 8 + 1 3 5 7 + k%4 +------- + 1 |1 1 1 1 + 3 |1 9 9 1 + */ + r0 = 1 + (((k << 2) & ((a0 << 1) ^ (a0 << 2))) & 8); + r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7f)); /* 8 bits */ + r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k & 0x7fff)); /* 16 bits */ + r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k)); /* 32 bits */ +#if GMP_NUMB_BITS > 32 + { + unsigned prec = 32; + do + { + r0 = kinv * r0 * (k+1 - akm1[0] * powlimb (r0, k)); + prec *= 2; + } + while (prec < GMP_NUMB_BITS); + } +#endif + + rp[0] = r0; + if (n == 1) + { + TMP_FREE; + return; + } + + /* For odd k, (k+1)/2 = k/2+1, and the latter avoids overflow. */ + kp1h = k/2 + 1; + + /* FIXME: Special case for two limb iteration. */ + rnp = TMP_ALLOC_LIMBS (2*n + 1); + ep = rnp + n; + + /* FIXME: Possible to this on the fly with some bit fiddling. */ + for (i = 0; n > 1; n = (n + 1)/2) + sizes[i++] = n; + + rn = 1; + + while (i-- > 0) + { + /* Compute x^{k+1}. */ + mpn_sqr (ep, rp, rn); /* For odd n, writes n+1 limbs in the + final iteration. */ + mpn_powlo (rnp, ep, &kp1h, 1, sizes[i], tp); + + /* Multiply by a^{k-1}. Can use wraparound; low part equals r. */ + + mpn_mullo_n (ep, rnp, akm1, sizes[i]); + ASSERT (mpn_cmp (ep, rp, rn) == 0); + + ASSERT (sizes[i] <= 2*rn); + mpn_pi1_bdiv_q_1 (rp + rn, ep + rn, sizes[i] - rn, k, kinv, 0); + mpn_neg (rp + rn, rp + rn, sizes[i] - rn); + rn = sizes[i]; + } + TMP_FREE; +} + +/* Computes a^{1/k} (mod B^n). Both a and k must be odd. */ +void +mpn_broot (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t k) +{ + mp_ptr tp; + TMP_DECL; + + ASSERT (n > 0); + ASSERT (ap[0] & 1); + ASSERT (k & 1); + + if (k == 1) + { + MPN_COPY (rp, ap, n); + return; + } + + TMP_MARK; + tp = TMP_ALLOC_LIMBS (n); + + mpn_broot_invm1 (tp, ap, n, k); + mpn_mullo_n (rp, tp, ap, n); + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/brootinv.c b/gmp-6.3.0/mpn/generic/brootinv.c new file mode 100644 index 0000000..e91b597 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/brootinv.c @@ -0,0 +1,159 @@ +/* mpn_brootinv, compute r such that r^k * y = 1 (mod 2^b). + + Contributed to the GNU project by Martin Boij (as part of perfpow.c). + +Copyright 2009, 2010, 2012, 2013, 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
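mpn_broot above returns the k-th root of an odd {ap,n} modulo B^n, for odd k. The defining property is easy to state with the public mpz interface; the helper below is an ad hoc illustration (check_binary_kth_root is not a GMP function) of what a caller could verify:

#include <gmp.h>

/* Returns nonzero iff r^k == a (mod 2^nbits), i.e. iff r is a binary k-th
   root of a to nbits bits, which is what mpn_broot computes with
   nbits = n * GMP_NUMB_BITS.  */
static int
check_binary_kth_root (const mpz_t r, const mpz_t a, unsigned long k,
                       mp_bitcnt_t nbits)
{
  mpz_t mod, rk, amod;
  int ok;

  mpz_inits (mod, rk, amod, NULL);
  mpz_setbit (mod, nbits);              /* mod = 2^nbits */
  mpz_powm_ui (rk, r, k, mod);          /* rk = r^k mod 2^nbits */
  mpz_mod (amod, a, mod);
  ok = mpz_cmp (rk, amod) == 0;
  mpz_clears (mod, rk, amod, NULL);
  return ok;
}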
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* Computes a^2e (mod B). Uses right-to-left binary algorithm, since + typical use will have e small. */ +static mp_limb_t +powsquaredlimb (mp_limb_t a, mp_limb_t e) +{ + mp_limb_t r; + + r = 1; + /* if (LIKELY (e != 0)) */ + do { + a *= a; + if (e & 1) + r *= a; + e >>= 1; + } while (e != 0); + + return r; +} + +/* Compute r such that r^k * y = 1 (mod B^n). + + Iterates + r' <-- k^{-1} ((k+1) r - r^{k+1} y) (mod 2^b) + using Hensel lifting, each time doubling the number of known bits in r. + + Works just for odd k. Else the Hensel lifting degenerates. + + FIXME: + + (1) Make it work for k == GMP_LIMB_MAX (k+1 below overflows). + + (2) Rewrite iteration as + r' <-- r - k^{-1} r (r^k y - 1) + and take advantage of the zero low part of r^k y - 1. + + (3) Use wrap-around trick. + + (4) Use a small table to get starting value. + + Scratch need: bn + (((bn + 1) >> 1) + 1) + scratch for mpn_powlo + Currently mpn_powlo requires 3*bn + so that 5*bn is surely enough, where bn = ceil (bnb / GMP_NUMB_BITS). 
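The iteration described above, in the rewritten form of FIXME item (2), r' = r - k^{-1} r (r^k y - 1), can be run end to end on a single 64-bit word. The sketch below is illustrative only: it seeds with one correct bit instead of the 4-bit table used by mpn_brootinv, and the helper names are invented for this example:

#include <stdint.h>

/* a^e mod 2^64 by right-to-left binary powering.  */
static uint64_t
pow64 (uint64_t a, uint64_t e)
{
  uint64_t r = 1;
  for (; e != 0; e >>= 1, a *= a)
    if (e & 1)
      r *= a;
  return r;
}

/* Inverse of odd u mod 2^64 by Newton doubling (see the sketch after
   binvert.c above).  */
static uint64_t
inv64 (uint64_t u)
{
  uint64_t r = u;               /* 3 correct bits */
  for (int i = 0; i < 5; i++)
    r *= 2 - u * r;             /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
  return r;
}

/* For odd y and odd k, returns r with r^k * y == 1 (mod 2^64).  */
static uint64_t
brootinv_word (uint64_t y, uint64_t k)
{
  uint64_t kinv = inv64 (k);
  uint64_t r = 1;                       /* correct mod 2 for any odd y */
  for (int bits = 1; bits < 64; bits *= 2)
    r = r - kinv * r * (pow64 (r, k) * y - 1);  /* doubles the correct bits */
  return r;
}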
+*/ + +void +mpn_brootinv (mp_ptr rp, mp_srcptr yp, mp_size_t bn, mp_limb_t k, mp_ptr tp) +{ + mp_ptr tp2, tp3; + mp_limb_t kinv, k2, r0, y0; + mp_size_t order[GMP_LIMB_BITS + 1]; + int d; + + ASSERT (bn > 0); + ASSERT ((k & 1) != 0); + + tp2 = tp + bn; + tp3 = tp + bn + ((bn + 3) >> 1); + k2 = (k >> 1) + 1; /* (k + 1) / 2 , but avoid k+1 overflow */ + + binvert_limb (kinv, k); + + /* 4-bit initial approximation: + + y%16 | 1 3 5 7 9 11 13 15, + k%4 +-------------------------+k2%2 + 1 | 1 11 13 7 9 3 5 15 | 1 + 3 | 1 3 5 7 9 11 13 15 | 0 + + */ + y0 = yp[0]; + + r0 = y0 ^ (((y0 << 1) ^ (y0 << 2)) & (k2 << 3) & 8); /* 4 bits */ + r0 = kinv * (k2 * r0 * 2 - y0 * powsquaredlimb(r0, k2 & 0x3f)); /* 8 bits */ + r0 = kinv * (k2 * r0 * 2 - y0 * powsquaredlimb(r0, k2 & 0x3fff)); /* 16 bits */ +#if GMP_NUMB_BITS > 16 + { + unsigned prec = 16; + do + { + r0 = kinv * (k2 * r0 * 2 - y0 * powsquaredlimb(r0, k2)); + prec *= 2; + } + while (prec < GMP_NUMB_BITS); + } +#endif + + rp[0] = r0; + if (bn == 1) + return; + + d = 0; + for (; bn != 2; bn = (bn + 1) >> 1) + order[d++] = bn; + + order[d] = 2; + bn = 1; + + do + { + mpn_sqr (tp, rp, bn); /* Result may overlap tp2 */ + tp2[bn] = mpn_mul_1 (tp2, rp, bn, k2 << 1); + + bn = order[d]; + + mpn_powlo (rp, tp, &k2, 1, bn, tp3); + mpn_mullo_n (tp, yp, rp, bn); + + /* mpn_sub (tp, tp2, ((bn + 1) >> 1) + 1, tp, bn); */ + /* The function above is not handled, ((bn + 1) >> 1) + 1 <= bn*/ + { + mp_size_t pbn = (bn + 3) >> 1; /* Size of tp2 */ + int borrow; + borrow = mpn_sub_n (tp, tp2, tp, pbn) != 0; + if (bn > pbn) /* 3 < bn */ + { + if (borrow) + mpn_com (tp + pbn, tp + pbn, bn - pbn); + else + mpn_neg (tp + pbn, tp + pbn, bn - pbn); + } + } + mpn_pi1_bdiv_q_1 (rp, tp, bn, k, kinv, 0); + } + while (--d >= 0); +} diff --git a/gmp-6.3.0/mpn/generic/bsqrt.c b/gmp-6.3.0/mpn/generic/bsqrt.c new file mode 100644 index 0000000..27184f0 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/bsqrt.c @@ -0,0 +1,47 @@ +/* mpn_bsqrt, a^{1/2} (mod 2^n). + +Copyright 2009, 2010, 2012, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +void +mpn_bsqrt (mp_ptr rp, mp_srcptr ap, mp_bitcnt_t nb, mp_ptr tp) +{ + mp_ptr sp; + mp_size_t n; + + ASSERT (nb > 0); + + n = nb / GMP_NUMB_BITS; + sp = tp + n; + + mpn_bsqrtinv (tp, ap, nb, sp); + mpn_mullo_n (rp, tp, ap, n); +} diff --git a/gmp-6.3.0/mpn/generic/bsqrtinv.c b/gmp-6.3.0/mpn/generic/bsqrtinv.c new file mode 100644 index 0000000..c286773 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/bsqrtinv.c @@ -0,0 +1,103 @@ +/* mpn_bsqrtinv, compute r such that r^2 * y = 1 (mod 2^{b+1}). 
+ + Contributed to the GNU project by Martin Boij (as part of perfpow.c). + +Copyright 2009, 2010, 2012, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* Compute r such that r^2 * y = 1 (mod 2^{b+1}). + Return non-zero if such an integer r exists. + + Iterates + r' <-- (3r - r^3 y) / 2 + using Hensel lifting. Since we divide by two, the Hensel lifting is + somewhat degenerates. Therefore, we lift from 2^b to 2^{b+1}-1. + + FIXME: + (1) Simplify to do precision book-keeping in limbs rather than bits. + + (2) Rewrite iteration as + r' <-- r - r (r^2 y - 1) / 2 + and take advantage of zero low part of r^2 y - 1. + + (3) Use wrap-around trick. + + (4) Use a small table to get starting value. +*/ +int +mpn_bsqrtinv (mp_ptr rp, mp_srcptr yp, mp_bitcnt_t bnb, mp_ptr tp) +{ + mp_ptr tp2; + mp_size_t bn, order[GMP_LIMB_BITS + 1]; + int i, d; + + ASSERT (bnb > 0); + + bn = 1 + bnb / GMP_LIMB_BITS; + + tp2 = tp + bn; + + rp[0] = 1; + if (bnb == 1) + { + if ((yp[0] & 3) != 1) + return 0; + } + else + { + if ((yp[0] & 7) != 1) + return 0; + + d = 0; + for (; bnb != 2; bnb = (bnb + 2) >> 1) + order[d++] = bnb; + + for (i = d - 1; i >= 0; i--) + { + bnb = order[i]; + bn = 1 + bnb / GMP_LIMB_BITS; + + mpn_sqrlo (tp, rp, bn); + mpn_mullo_n (tp2, rp, tp, bn); /* tp2 <- rp ^ 3 */ + + mpn_mul_1 (tp, rp, bn, 3); + + mpn_mullo_n (rp, yp, tp2, bn); + +#if HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (rp, tp, rp, bn); +#else + mpn_sub_n (tp2, tp, rp, bn); + mpn_rshift (rp, tp2, bn, 1); +#endif + } + } + return 1; +} diff --git a/gmp-6.3.0/mpn/generic/cmp.c b/gmp-6.3.0/mpn/generic/cmp.c new file mode 100644 index 0000000..940314b --- /dev/null +++ b/gmp-6.3.0/mpn/generic/cmp.c @@ -0,0 +1,33 @@ +/* mpn_cmp -- Compare two low-level natural-number integers. + +Copyright 1991, 1993, 1994, 1996, 2000, 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define __GMP_FORCE_mpn_cmp 1 + +#include "gmp-impl.h" diff --git a/gmp-6.3.0/mpn/generic/cnd_add_n.c b/gmp-6.3.0/mpn/generic/cnd_add_n.c new file mode 100644 index 0000000..e6b1373 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/cnd_add_n.c @@ -0,0 +1,69 @@ +/* mpn_cnd_add_n -- Compute R = U + V if CND != 0 or R = U if CND == 0. + Both cases should take the same time and perform the exact same memory + accesses, since this function is intended to be used where side-channel + attack resilience is relevant. + +Copyright 1992-1994, 1996, 2000, 2002, 2008, 2009, 2011, 2013 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +mp_limb_t +mpn_cnd_add_n (mp_limb_t cnd, mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t ul, vl, sl, rl, cy, cy1, cy2, mask; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + + mask = -(mp_limb_t) (cnd != 0); + cy = 0; + do + { + ul = *up++; + vl = *vp++ & mask; +#if GMP_NAIL_BITS == 0 + sl = ul + vl; + cy1 = sl < ul; + rl = sl + cy; + cy2 = rl < sl; + cy = cy1 | cy2; + *rp++ = rl; +#else + rl = ul + vl; + rl += cy; + cy = rl >> GMP_NUMB_BITS; + *rp++ = rl & GMP_NUMB_MASK; +#endif + } + while (--n != 0); + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/cnd_sub_n.c b/gmp-6.3.0/mpn/generic/cnd_sub_n.c new file mode 100644 index 0000000..d04ad8a --- /dev/null +++ b/gmp-6.3.0/mpn/generic/cnd_sub_n.c @@ -0,0 +1,69 @@ +/* mpn_cnd_sub_n -- Compute R = U - V if CND != 0 or R = U if CND == 0. + Both cases should take the same time and perform the exact same memory + accesses, since this function is intended to be used where side-channel + attack resilience is relevant. + +Copyright 1992-1994, 1996, 2000, 2002, 2008, 2009, 2011, 2013 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +mp_limb_t +mpn_cnd_sub_n (mp_limb_t cnd, mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t ul, vl, sl, rl, cy, cy1, cy2, mask; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + + mask = -(mp_limb_t) (cnd != 0); + cy = 0; + do + { + ul = *up++; + vl = *vp++ & mask; +#if GMP_NAIL_BITS == 0 + sl = ul - vl; + cy1 = sl > ul; + rl = sl - cy; + cy2 = rl > sl; + cy = cy1 | cy2; + *rp++ = rl; +#else + rl = ul - vl; + rl -= cy; + cy = rl >> (GMP_LIMB_BITS - 1); + *rp++ = rl & GMP_NUMB_MASK; +#endif + } + while (--n != 0); + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/cnd_swap.c b/gmp-6.3.0/mpn/generic/cnd_swap.c new file mode 100644 index 0000000..83d856d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/cnd_swap.c @@ -0,0 +1,50 @@ +/* mpn_cnd_swap + + Contributed to the GNU project by Niels Möller + +Copyright 2013, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +void +mpn_cnd_swap (mp_limb_t cnd, volatile mp_limb_t *ap, volatile mp_limb_t *bp, + mp_size_t n) +{ + volatile mp_limb_t mask = - (mp_limb_t) (cnd != 0); + mp_size_t i; + for (i = 0; i < n; i++) + { + mp_limb_t a, b, t; + a = ap[i]; + b = bp[i]; + t = (a ^ b) & mask; + ap[i] = a ^ t; + bp[i] = b ^ t; + } +} diff --git a/gmp-6.3.0/mpn/generic/com.c b/gmp-6.3.0/mpn/generic/com.c new file mode 100644 index 0000000..4de5824 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/com.c @@ -0,0 +1,44 @@ +/* mpn_com - complement an mpn. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
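mpn_cnd_add_n, mpn_cnd_sub_n and mpn_cnd_swap all rely on the same branch-free masking idiom: mask = -(cnd != 0) is either all ones or all zeros, so the executed instructions and the memory accesses are identical whichever way the condition goes. A word-level sketch of the idiom outside GMP's types:

#include <stdint.h>

/* Swap *a and *b exactly when cnd is nonzero, with data flow that does not
   depend on cnd: t is zero when the mask is zero, so both stores happen in
   either case.  */
static void
cnd_swap_words (uint64_t cnd, uint64_t *a, uint64_t *b)
{
  uint64_t mask = - (uint64_t) (cnd != 0);
  uint64_t t = (*a ^ *b) & mask;
  *a ^= t;
  *b ^= t;
}

/* usage: cnd_swap_words (bit, &x, &y) swaps x and y iff bit != 0 */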
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#undef mpn_com +#define mpn_com __MPN(com) + +void +mpn_com (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_limb_t ul; + do { + ul = *up++; + *rp++ = ~ul & GMP_NUMB_MASK; + } while (--n != 0); +} diff --git a/gmp-6.3.0/mpn/generic/comb_tables.c b/gmp-6.3.0/mpn/generic/comb_tables.c new file mode 100644 index 0000000..dedb77b --- /dev/null +++ b/gmp-6.3.0/mpn/generic/comb_tables.c @@ -0,0 +1,47 @@ +/* Const tables shared among combinatoric functions. + + THE CONTENTS OF THIS FILE ARE FOR INTERNAL USE AND ARE ALMOST CERTAIN TO + BE SUBJECT TO INCOMPATIBLE CHANGES IN FUTURE GNU MP RELEASES. + +Copyright 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* Entry i contains (i!/2^t) where t is chosen such that the parenthesis + is an odd integer. */ +const mp_limb_t __gmp_oddfac_table[] = { ONE_LIMB_ODD_FACTORIAL_TABLE, ONE_LIMB_ODD_FACTORIAL_EXTTABLE }; + +/* Entry i contains ((2i+1)!!/2^t) where t is chosen such that the parenthesis + is an odd integer. */ +const mp_limb_t __gmp_odd2fac_table[] = { ONE_LIMB_ODD_DOUBLEFACTORIAL_TABLE }; + +/* Entry i contains 2i-popc(2i). */ +const unsigned char __gmp_fac2cnt_table[] = { TABLE_2N_MINUS_POPC_2N }; + +const mp_limb_t __gmp_limbroots_table[] = { NTH_ROOT_NUMB_MASK_TABLE }; diff --git a/gmp-6.3.0/mpn/generic/compute_powtab.c b/gmp-6.3.0/mpn/generic/compute_powtab.c new file mode 100644 index 0000000..f4fbc64 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/compute_powtab.c @@ -0,0 +1,373 @@ +/* mpn_compute_powtab. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 1991-2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
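For a concrete reading of the comb_tables.c comments above: entry i of __gmp_oddfac_table is i! with every factor of two divided out. A naive sketch of that quantity for small i (the actual entries come from the ONE_LIMB_ODD_FACTORIAL_TABLE macro, not from code like this):

#include <stdint.h>

/* Odd part of i!, i.e. i!/2^t with t the 2-adic valuation of i!.  Only
   meaningful here for i small enough that the value fits in 64 bits.  */
static uint64_t
odd_factorial (unsigned i)
{
  uint64_t f = 1;
  for (unsigned j = 2; j <= i; j++)
    {
      uint64_t v = j;
      while ((v & 1) == 0)      /* strip the factors of two of each term */
        v >>= 1;
      f *= v;                   /* product of odd parts stays odd */
    }
  return f;
}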
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* + CAVEATS: + * The exptab and powtab vectors are in opposite orders. Probably OK. + * Consider getting rid of exptab, doing bit ops on the un argument instead. + * Consider rounding greatest power slightly upwards to save adjustments. + * In powtab_decide, consider computing cost from just the 2-3 largest + operands, since smaller operand contribute little. This makes most sense + if exptab is suppressed. +*/ + +#include "gmp-impl.h" + +#ifndef DIV_1_VS_MUL_1_PERCENT +#define DIV_1_VS_MUL_1_PERCENT 150 +#endif + +#define SET_powers_t(dest, ptr, size, dib, b, sh) \ + do { \ + dest.p = ptr; \ + dest.n = size; \ + dest.digits_in_base = dib; \ + dest.base = b; \ + dest.shift = sh; \ + } while (0) + +#if DIV_1_VS_MUL_1_PERCENT > 120 +#define HAVE_mpn_compute_powtab_mul 1 +static void +mpn_compute_powtab_mul (powers_t *powtab, mp_ptr powtab_mem, mp_size_t un, + int base, const size_t *exptab, size_t n_pows) +{ + mp_size_t n; + mp_ptr p, t; + mp_limb_t cy; + long start_idx; + int c; + + mp_limb_t big_base = mp_bases[base].big_base; + int chars_per_limb = mp_bases[base].chars_per_limb; + + mp_ptr powtab_mem_ptr = powtab_mem; + + size_t digits_in_base = chars_per_limb; + + powers_t *pt = powtab; + + p = powtab_mem_ptr; + powtab_mem_ptr += 1; + p[0] = big_base; + + SET_powers_t (pt[0], p, 1, digits_in_base, base, 0); + pt++; + + t = powtab_mem_ptr; + powtab_mem_ptr += 2; + t[1] = mpn_mul_1 (t, p, 1, big_base); + n = 2; + + digits_in_base *= 2; + + c = t[0] == 0; + t += c; + n -= c; + mp_size_t shift = c; + + SET_powers_t (pt[0], t, n, digits_in_base, base, shift); + p = t; + pt++; + + if (exptab[0] == ((size_t) chars_per_limb << n_pows)) + { + start_idx = n_pows - 2; + } + else + { + if (((digits_in_base + chars_per_limb) << (n_pows-2)) <= exptab[0]) + { + /* 3, sometimes adjusted to 4. */ + t = powtab_mem_ptr; + powtab_mem_ptr += 4; + t[n] = cy = mpn_mul_1 (t, p, n, big_base); + n += cy != 0;; + + digits_in_base += chars_per_limb; + + c = t[0] == 0; + t += c; + n -= c; + shift += c; + } + else + { + /* 2 copy, will always become 3 with back-multiplication. 
*/ + t = powtab_mem_ptr; + powtab_mem_ptr += 3; + t[0] = p[0]; + t[1] = p[1]; + } + + SET_powers_t (pt[0], t, n, digits_in_base, base, shift); + p = t; + pt++; + start_idx = n_pows - 3; + } + + for (long pi = start_idx; pi >= 0; pi--) + { + t = powtab_mem_ptr; + powtab_mem_ptr += 2 * n + 2; + + ASSERT (powtab_mem_ptr < powtab_mem + mpn_str_powtab_alloc (un)); + + mpn_sqr (t, p, n); + + digits_in_base *= 2; + n *= 2; + n -= t[n - 1] == 0; + shift *= 2; + + c = t[0] == 0; + t += c; + n -= c; + shift += c; + + /* Adjust new value if it is too small as input to the next squaring. */ + if (((digits_in_base + chars_per_limb) << pi) <= exptab[0]) + { + t[n] = cy = mpn_mul_1 (t, t, n, big_base); + n += cy != 0; + + digits_in_base += chars_per_limb; + + c = t[0] == 0; + t += c; + n -= c; + shift += c; + } + + SET_powers_t (pt[0], t, n, digits_in_base, base, shift); + + /* Adjust previous value if it is not at its target power. */ + if (pt[-1].digits_in_base < exptab[pi + 1]) + { + mp_size_t n = pt[-1].n; + mp_ptr p = pt[-1].p; + p[n] = cy = mpn_mul_1 (p, p, n, big_base); + n += cy != 0; + + ASSERT (pt[-1].digits_in_base + chars_per_limb == exptab[pi + 1]); + pt[-1].digits_in_base = exptab[pi + 1]; + + c = p[0] == 0; + pt[-1].p = p + c; + pt[-1].n = n - c; + pt[-1].shift += c; + } + + p = t; + pt++; + } +} +#endif + +#if DIV_1_VS_MUL_1_PERCENT < 275 +#define HAVE_mpn_compute_powtab_div 1 +static void +mpn_compute_powtab_div (powers_t *powtab, mp_ptr powtab_mem, mp_size_t un, + int base, const size_t *exptab, size_t n_pows) +{ + mp_ptr p, t; + + mp_limb_t big_base = mp_bases[base].big_base; + int chars_per_limb = mp_bases[base].chars_per_limb; + + mp_ptr powtab_mem_ptr = powtab_mem; + + size_t digits_in_base = chars_per_limb; + + powers_t *pt = powtab; + + p = powtab_mem_ptr; + powtab_mem_ptr += 1; + p[0] = big_base; + + SET_powers_t (pt[0], p, 1, digits_in_base, base, 0); + pt++; + + mp_size_t n = 1; + mp_size_t shift = 0; + for (long pi = n_pows - 1; pi >= 0; pi--) + { + t = powtab_mem_ptr; + powtab_mem_ptr += 2 * n; + + ASSERT (powtab_mem_ptr < powtab_mem + mpn_str_powtab_alloc (un)); + + mpn_sqr (t, p, n); + n = 2 * n - 1; n += t[n] != 0; + digits_in_base *= 2; + + if (digits_in_base != exptab[pi]) /* if ((((un - 1) >> pi) & 2) == 0) */ + { +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 || ! HAVE_NATIVE_mpn_divexact_1 + if (__GMP_LIKELY (base == 10)) + mpn_pi1_bdiv_q_1 (t, t, n, big_base >> MP_BASES_BIG_BASE_CTZ_10, + MP_BASES_BIG_BASE_BINVERTED_10, + MP_BASES_BIG_BASE_CTZ_10); + else +#endif + /* FIXME: We could use _pi1 here if we add big_base_binverted and + big_base_ctz fields to struct bases. That would add about 2 KiB + to mp_bases.c. + FIXME: Use mpn_bdiv_q_1 here when mpn_divexact_1 is converted to + mpn_bdiv_q_1 for more machines. */ + mpn_divexact_1 (t, t, n, big_base); + + n -= t[n - 1] == 0; + digits_in_base -= chars_per_limb; + } + + shift *= 2; + /* Strip low zero limbs, but be careful to keep the result divisible by + big_base. */ + while (t[0] == 0 && (t[1] & ((big_base & -big_base) - 1)) == 0) + { + t++; + n--; + shift++; + } + p = t; + + SET_powers_t (pt[0], p, n, digits_in_base, base, shift); + pt++; + } + + /* Strip any remaining low zero limbs. 
*/ + pt -= n_pows + 1; + for (long pi = n_pows; pi >= 0; pi--) + { + mp_ptr t = pt[pi].p; + mp_size_t shift = pt[pi].shift; + mp_size_t n = pt[pi].n; + int c; + c = t[0] == 0; + t += c; + n -= c; + shift += c; + pt[pi].p = t; + pt[pi].shift = shift; + pt[pi].n = n; + } +} +#endif + +static long +powtab_decide (size_t *exptab, size_t un, int base) +{ + int chars_per_limb = mp_bases[base].chars_per_limb; + long n_pows = 0; + for (size_t pn = (un + 1) >> 1; pn != 1; pn = (pn + 1) >> 1) + { + exptab[n_pows] = pn * chars_per_limb; + n_pows++; + } + exptab[n_pows] = chars_per_limb; + +#if HAVE_mpn_compute_powtab_mul && HAVE_mpn_compute_powtab_div + size_t pn = un - 1; + size_t xn = (un + 1) >> 1; + unsigned mcost = 1; + unsigned dcost = 1; + for (long i = n_pows - 2; i >= 0; i--) + { + size_t pow = (pn >> (i + 1)) + 1; + + if (pow & 1) + dcost += pow; + + if (xn != (pow << i)) + { + if (pow > 2 && (pow & 1) == 0) + mcost += 2 * pow; + else + mcost += pow; + } + else + { + if (pow & 1) + mcost += pow; + } + } + + dcost = dcost * DIV_1_VS_MUL_1_PERCENT / 100; + + if (mcost <= dcost) + return n_pows; + else + return -n_pows; +#elif HAVE_mpn_compute_powtab_mul + return n_pows; +#elif HAVE_mpn_compute_powtab_div + return -n_pows; +#else +#error "no powtab function available" +#endif +} + +size_t +mpn_compute_powtab (powers_t *powtab, mp_ptr powtab_mem, mp_size_t un, int base) +{ + size_t exptab[GMP_LIMB_BITS]; + + long n_pows = powtab_decide (exptab, un, base); + +#if HAVE_mpn_compute_powtab_mul && HAVE_mpn_compute_powtab_div + if (n_pows >= 0) + { + mpn_compute_powtab_mul (powtab, powtab_mem, un, base, exptab, n_pows); + return n_pows; + } + else + { + mpn_compute_powtab_div (powtab, powtab_mem, un, base, exptab, -n_pows); + return -n_pows; + } +#elif HAVE_mpn_compute_powtab_mul + ASSERT (n_pows > 0); + mpn_compute_powtab_mul (powtab, powtab_mem, un, base, exptab, n_pows); + return n_pows; +#elif HAVE_mpn_compute_powtab_div + ASSERT (n_pows < 0); + mpn_compute_powtab_div (powtab, powtab_mem, un, base, exptab, -n_pows); + return -n_pows; +#else +#error "no powtab function available" +#endif +} diff --git a/gmp-6.3.0/mpn/generic/copyd.c b/gmp-6.3.0/mpn/generic/copyd.c new file mode 100644 index 0000000..7def007 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/copyd.c @@ -0,0 +1,40 @@ +/* mpn_copyd + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
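powtab_decide above only records a chain of digit counts: starting from roughly half the input size in limbs and halving until a single limb remains, each count multiplied by chars_per_limb. A standalone illustration of that chain; the value 19 is chars_per_limb for base 10 with 64-bit limbs and is assumed here rather than read from mp_bases:

#include <stdio.h>

/* Prints the digit counts that powtab_decide would store in exptab[] for a
   1000-limb operand converted to base 10 (chars_per_limb = 19 assumed).  */
int
main (void)
{
  const unsigned chars_per_limb = 19;
  size_t un = 1000;
  for (size_t pn = (un + 1) >> 1; pn != 1; pn = (pn + 1) >> 1)
    printf ("%zu digits\n", pn * chars_per_limb);
  printf ("%u digits\n", chars_per_limb);
  return 0;
}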
*/ + +#include "gmp-impl.h" + +void +mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_size_t i; + + for (i = n - 1; i >= 0; i--) + rp[i] = up[i]; +} diff --git a/gmp-6.3.0/mpn/generic/copyi.c b/gmp-6.3.0/mpn/generic/copyi.c new file mode 100644 index 0000000..736e0b5 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/copyi.c @@ -0,0 +1,42 @@ +/* mpn_copyi + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +void +mpn_copyi (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_size_t i; + + up += n; + rp += n; + for (i = -n; i != 0; i++) + rp[i] = up[i]; +} diff --git a/gmp-6.3.0/mpn/generic/dcpi1_bdiv_q.c b/gmp-6.3.0/mpn/generic/dcpi1_bdiv_q.c new file mode 100644 index 0000000..3c21818 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/dcpi1_bdiv_q.c @@ -0,0 +1,161 @@ +/* mpn_dcpi1_bdiv_q -- divide-and-conquer Hensel division with precomputed + inverse, returning quotient. + + Contributed to the GNU project by Niels Möller and Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009-2011, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +#if 0 /* unused, so leave out for now */ +static mp_size_t +mpn_dcpi1_bdiv_q_n_itch (mp_size_t n) +{ + /* NOTE: Depends on mullo_n and mpn_dcpi1_bdiv_qr_n interface */ + return n; +} +#endif + +/* Computes Q = - N / D mod B^n, destroys N. 
+ + N = {np,n} + D = {dp,n} +*/ + +static void +mpn_dcpi1_bdiv_q_n (mp_ptr qp, + mp_ptr np, mp_srcptr dp, mp_size_t n, + mp_limb_t dinv, mp_ptr tp) +{ + while (ABOVE_THRESHOLD (n, DC_BDIV_Q_THRESHOLD)) + { + mp_size_t lo, hi; + mp_limb_t cy; + + lo = n >> 1; /* floor(n/2) */ + hi = n - lo; /* ceil(n/2) */ + + cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, lo, dinv, tp); + + mpn_mullo_n (tp, qp, dp + hi, lo); + mpn_add_n (np + hi, np + hi, tp, lo); + + if (lo < hi) + { + cy += mpn_addmul_1 (np + lo, qp, lo, dp[lo]); + np[n - 1] += cy; + } + qp += lo; + np += lo; + n -= lo; + } + mpn_sbpi1_bdiv_q (qp, np, n, dp, n, dinv); +} + +/* Computes Q = - N / D mod B^nn, destroys N. + + N = {np,nn} + D = {dp,dn} +*/ + +void +mpn_dcpi1_bdiv_q (mp_ptr qp, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_limb_t dinv) +{ + mp_size_t qn; + mp_limb_t cy; + mp_ptr tp; + TMP_DECL; + + TMP_MARK; + + ASSERT (dn >= 2); + ASSERT (nn - dn >= 0); + ASSERT (dp[0] & 1); + + tp = TMP_SALLOC_LIMBS (dn); + + qn = nn; + + if (qn > dn) + { + /* Reduce qn mod dn in a super-efficient manner. */ + do + qn -= dn; + while (qn > dn); + + /* Perform the typically smaller block first. */ + if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD)) + cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv); + else + cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp); + + if (qn != dn) + { + if (qn > dn - qn) + mpn_mul (tp, qp, qn, dp + qn, dn - qn); + else + mpn_mul (tp, dp + qn, dn - qn, qp, qn); + mpn_incr_u (tp + qn, cy); + + mpn_add (np + qn, np + qn, nn - qn, tp, dn); + cy = 0; + } + + np += qn; + qp += qn; + + qn = nn - qn; + while (qn > dn) + { + mpn_add_1 (np + dn, np + dn, qn - dn, cy); + cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, dn, dinv, tp); + qp += dn; + np += dn; + qn -= dn; + } + mpn_dcpi1_bdiv_q_n (qp, np, dp, dn, dinv, tp); + } + else + { + if (BELOW_THRESHOLD (qn, DC_BDIV_Q_THRESHOLD)) + mpn_sbpi1_bdiv_q (qp, np, qn, dp, qn, dinv); + else + mpn_dcpi1_bdiv_q_n (qp, np, dp, qn, dinv, tp); + } + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/dcpi1_bdiv_qr.c b/gmp-6.3.0/mpn/generic/dcpi1_bdiv_qr.c new file mode 100644 index 0000000..11da44f --- /dev/null +++ b/gmp-6.3.0/mpn/generic/dcpi1_bdiv_qr.c @@ -0,0 +1,176 @@ +/* mpn_dcpi1_bdiv_qr -- divide-and-conquer Hensel division with precomputed + inverse, returning quotient and remainder. + + Contributed to the GNU project by Niels Möller and Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009, 2010, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +/* Computes Hensel binary division of {np, 2*n} by {dp, n}. + + Output: + + q = -n * d^{-1} mod 2^{qn * GMP_NUMB_BITS}, + + r = (n + q * d) * 2^{-qn * GMP_NUMB_BITS} + + Stores q at qp. Stores the n least significant limbs of r at the high half + of np, and returns the carry from the addition n + q*d. + + d must be odd. dinv is (-d)^-1 mod 2^GMP_NUMB_BITS. */ + +mp_size_t +mpn_dcpi1_bdiv_qr_n_itch (mp_size_t n) +{ + return n; +} + +mp_limb_t +mpn_dcpi1_bdiv_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, + mp_limb_t dinv, mp_ptr tp) +{ + mp_size_t lo, hi; + mp_limb_t cy; + mp_limb_t rh; + + lo = n >> 1; /* floor(n/2) */ + hi = n - lo; /* ceil(n/2) */ + + if (BELOW_THRESHOLD (lo, DC_BDIV_QR_THRESHOLD)) + cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * lo, dp, lo, dinv); + else + cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, lo, dinv, tp); + + mpn_mul (tp, dp + lo, hi, qp, lo); + + mpn_incr_u (tp + lo, cy); + rh = mpn_add (np + lo, np + lo, n + hi, tp, n); + + if (BELOW_THRESHOLD (hi, DC_BDIV_QR_THRESHOLD)) + cy = mpn_sbpi1_bdiv_qr (qp + lo, np + lo, 2 * hi, dp, hi, dinv); + else + cy = mpn_dcpi1_bdiv_qr_n (qp + lo, np + lo, dp, hi, dinv, tp); + + mpn_mul (tp, qp + lo, hi, dp + hi, lo); + + mpn_incr_u (tp + hi, cy); + rh += mpn_add_n (np + n, np + n, tp, n); + + return rh; +} + +mp_limb_t +mpn_dcpi1_bdiv_qr (mp_ptr qp, mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, mp_limb_t dinv) +{ + mp_size_t qn; + mp_limb_t rr, cy; + mp_ptr tp; + TMP_DECL; + + TMP_MARK; + + ASSERT (dn >= 2); /* to adhere to mpn_sbpi1_div_qr's limits */ + ASSERT (nn - dn >= 1); /* to adhere to mpn_sbpi1_div_qr's limits */ + ASSERT (dp[0] & 1); + + tp = TMP_SALLOC_LIMBS (dn); + + qn = nn - dn; + + if (qn > dn) + { + /* Reduce qn mod dn without division, optimizing small operations. */ + do + qn -= dn; + while (qn > dn); + + /* Perform the typically smaller block first. */ + if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD)) + cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv); + else + cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp); + + rr = 0; + if (qn != dn) + { + if (qn > dn - qn) + mpn_mul (tp, qp, qn, dp + qn, dn - qn); + else + mpn_mul (tp, dp + qn, dn - qn, qp, qn); + mpn_incr_u (tp + qn, cy); + + rr = mpn_add (np + qn, np + qn, nn - qn, tp, dn); + cy = 0; + } + + np += qn; + qp += qn; + + qn = nn - dn - qn; + do + { + rr += mpn_add_1 (np + dn, np + dn, qn, cy); + cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, dn, dinv, tp); + qp += dn; + np += dn; + qn -= dn; + } + while (qn > 0); + TMP_FREE; + return rr + cy; + } + + if (BELOW_THRESHOLD (qn, DC_BDIV_QR_THRESHOLD)) + cy = mpn_sbpi1_bdiv_qr (qp, np, 2 * qn, dp, qn, dinv); + else + cy = mpn_dcpi1_bdiv_qr_n (qp, np, dp, qn, dinv, tp); + + rr = 0; + if (qn != dn) + { + if (qn > dn - qn) + mpn_mul (tp, qp, qn, dp + qn, dn - qn); + else + mpn_mul (tp, dp + qn, dn - qn, qp, qn); + mpn_incr_u (tp + qn, cy); + + rr = mpn_add (np + qn, np + qn, nn - qn, tp, dn); + cy = 0; + } + + TMP_FREE; + return rr + cy; +} diff --git a/gmp-6.3.0/mpn/generic/dcpi1_div_q.c b/gmp-6.3.0/mpn/generic/dcpi1_div_q.c new file mode 100644 index 0000000..1905c98 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/dcpi1_div_q.c @@ -0,0 +1,86 @@ +/* mpn_dc_div_q -- divide-and-conquer division, returning exact quotient + only. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. 
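The quotient/remainder pair documented at the top of dcpi1_bdiv_qr.c can be restated with the public mpz interface: q = -n * d^{-1} mod 2^k and r = (n + q*d) / 2^k, the final division being exact. The sketch below is illustrative only (hensel_div_qr is not a GMP function); it takes the precision k in bits and assumes nonnegative n and odd d:

#include <gmp.h>

/* Hensel (binary) division of n by odd d to k bits, using public mpz calls:
   on return q*d + (-n) == 0 (mod 2^k) and r is the exact high part
   (n + q*d) / 2^k, matching the relations stated for mpn_dcpi1_bdiv_qr.  */
void
hensel_div_qr (mpz_t q, mpz_t r, const mpz_t n, const mpz_t d, mp_bitcnt_t k)
{
  mpz_t mod;
  mpz_init (mod);
  mpz_setbit (mod, k);                  /* mod = 2^k */
  mpz_invert (q, d, mod);               /* d^{-1} mod 2^k, d must be odd */
  mpz_mul (q, q, n);
  mpz_neg (q, q);
  mpz_mod (q, q, mod);                  /* q = -n * d^{-1} mod 2^k */
  mpz_mul (r, q, d);
  mpz_add (r, r, n);                    /* n + q*d is divisible by 2^k */
  mpz_tdiv_q_2exp (r, r, k);            /* exact shift down by k bits */
  mpz_clear (mod);
}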
+ + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009, 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +mp_limb_t +mpn_dcpi1_div_q (mp_ptr qp, mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, gmp_pi1_t *dinv) +{ + mp_ptr tp, wp; + mp_limb_t qh; + mp_size_t qn; + TMP_DECL; + + TMP_MARK; + + ASSERT (dn >= 6); + ASSERT (nn - dn >= 3); + ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT); + + tp = TMP_ALLOC_LIMBS (nn + 1); + MPN_COPY (tp + 1, np, nn); + tp[0] = 0; + + qn = nn - dn; + wp = TMP_ALLOC_LIMBS (qn + 1); + + qh = mpn_dcpi1_divappr_q (wp, tp, nn + 1, dp, dn, dinv); + + if (wp[0] == 0) + { + mp_limb_t cy; + + if (qn > dn) + mpn_mul (tp, wp + 1, qn, dp, dn); + else + mpn_mul (tp, dp, dn, wp + 1, qn); + + cy = (qh != 0) ? mpn_add_n (tp + qn, tp + qn, dp, dn) : 0; + + if (cy || mpn_cmp (tp, np, nn) > 0) /* At most is wrong by one, no cycle. */ + qh -= mpn_sub_1 (qp, wp + 1, qn, 1); + else /* Same as below */ + MPN_COPY (qp, wp + 1, qn); + } + else + MPN_COPY (qp, wp + 1, qn); + + TMP_FREE; + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/dcpi1_div_qr.c b/gmp-6.3.0/mpn/generic/dcpi1_div_qr.c new file mode 100644 index 0000000..d7a65f8 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/dcpi1_div_qr.c @@ -0,0 +1,248 @@ +/* mpn_dcpi1_div_qr_n -- recursive divide-and-conquer division for arbitrary + size operands. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +mp_limb_t +mpn_dcpi1_div_qr_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, + gmp_pi1_t *dinv, mp_ptr tp) +{ + mp_size_t lo, hi; + mp_limb_t cy, qh, ql; + + lo = n >> 1; /* floor(n/2) */ + hi = n - lo; /* ceil(n/2) */ + + if (BELOW_THRESHOLD (hi, DC_DIV_QR_THRESHOLD)) + qh = mpn_sbpi1_div_qr (qp + lo, np + 2 * lo, 2 * hi, dp + lo, hi, dinv->inv32); + else + qh = mpn_dcpi1_div_qr_n (qp + lo, np + 2 * lo, dp + lo, hi, dinv, tp); + + mpn_mul (tp, qp + lo, hi, dp, lo); + + cy = mpn_sub_n (np + lo, np + lo, tp, n); + if (qh != 0) + cy += mpn_sub_n (np + n, np + n, dp, lo); + + while (cy != 0) + { + qh -= mpn_sub_1 (qp + lo, qp + lo, hi, 1); + cy -= mpn_add_n (np + lo, np + lo, dp, n); + } + + if (BELOW_THRESHOLD (lo, DC_DIV_QR_THRESHOLD)) + ql = mpn_sbpi1_div_qr (qp, np + hi, 2 * lo, dp + hi, lo, dinv->inv32); + else + ql = mpn_dcpi1_div_qr_n (qp, np + hi, dp + hi, lo, dinv, tp); + + mpn_mul (tp, dp, hi, qp, lo); + + cy = mpn_sub_n (np, np, tp, n); + if (ql != 0) + cy += mpn_sub_n (np + lo, np + lo, dp, hi); + + while (cy != 0) + { + mpn_sub_1 (qp, qp, lo, 1); + cy -= mpn_add_n (np, np, dp, n); + } + + return qh; +} + +mp_limb_t +mpn_dcpi1_div_qr (mp_ptr qp, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + gmp_pi1_t *dinv) +{ + mp_size_t qn; + mp_limb_t qh, cy; + mp_ptr tp; + TMP_DECL; + + TMP_MARK; + + ASSERT (dn >= 6); /* to adhere to mpn_sbpi1_div_qr's limits */ + ASSERT (nn - dn >= 3); /* to adhere to mpn_sbpi1_div_qr's limits */ + ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT); + + tp = TMP_ALLOC_LIMBS (dn); + + qn = nn - dn; + qp += qn; + np += nn; + dp += dn; + + if (qn > dn) + { + /* Reduce qn mod dn without division, optimizing small operations. */ + do + qn -= dn; + while (qn > dn); + + qp -= qn; /* point at low limb of next quotient block */ + np -= qn; /* point in the middle of partial remainder */ + + /* Perform the typically smaller block first. */ + if (qn == 1) + { + mp_limb_t q, n2, n1, n0, d1, d0; + + /* Handle qh up front, for simplicity. */ + qh = mpn_cmp (np - dn + 1, dp - dn, dn) >= 0; + if (qh) + ASSERT_NOCARRY (mpn_sub_n (np - dn + 1, np - dn + 1, dp - dn, dn)); + + /* A single iteration of schoolbook: One 3/2 division, + followed by the bignum update and adjustment. */ + n2 = np[0]; + n1 = np[-1]; + n0 = np[-2]; + d1 = dp[-1]; + d0 = dp[-2]; + + ASSERT (n2 < d1 || (n2 == d1 && n1 <= d0)); + + if (UNLIKELY (n2 == d1) && n1 == d0) + { + q = GMP_NUMB_MASK; + cy = mpn_submul_1 (np - dn, dp - dn, dn, q); + ASSERT (cy == n2); + } + else + { + udiv_qr_3by2 (q, n1, n0, n2, n1, n0, d1, d0, dinv->inv32); + + if (dn > 2) + { + mp_limb_t cy, cy1; + cy = mpn_submul_1 (np - dn, dp - dn, dn - 2, q); + + cy1 = n0 < cy; + n0 = (n0 - cy) & GMP_NUMB_MASK; + cy = n1 < cy1; + n1 = (n1 - cy1) & GMP_NUMB_MASK; + np[-2] = n0; + + if (UNLIKELY (cy != 0)) + { + n1 += d1 + mpn_add_n (np - dn, np - dn, dp - dn, dn - 1); + qh -= (q == 0); + q = (q - 1) & GMP_NUMB_MASK; + } + } + else + np[-2] = n0; + + np[-1] = n1; + } + qp[0] = q; + } + else + { + /* Do a 2qn / qn division */ + if (qn == 2) + qh = mpn_divrem_2 (qp, 0L, np - 2, 4, dp - 2); /* FIXME: obsolete function. Use 5/3 division? 
*/ + else if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD)) + qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32); + else + qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp); + + if (qn != dn) + { + if (qn > dn - qn) + mpn_mul (tp, qp, qn, dp - dn, dn - qn); + else + mpn_mul (tp, dp - dn, dn - qn, qp, qn); + + cy = mpn_sub_n (np - dn, np - dn, tp, dn); + if (qh != 0) + cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn); + + while (cy != 0) + { + qh -= mpn_sub_1 (qp, qp, qn, 1); + cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn); + } + } + } + + qn = nn - dn - qn; + do + { + qp -= dn; + np -= dn; + mpn_dcpi1_div_qr_n (qp, np - dn, dp - dn, dn, dinv, tp); + qn -= dn; + } + while (qn > 0); + } + else + { + qp -= qn; /* point at low limb of next quotient block */ + np -= qn; /* point in the middle of partial remainder */ + + if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD)) + qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32); + else + qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp); + + if (qn != dn) + { + if (qn > dn - qn) + mpn_mul (tp, qp, qn, dp - dn, dn - qn); + else + mpn_mul (tp, dp - dn, dn - qn, qp, qn); + + cy = mpn_sub_n (np - dn, np - dn, tp, dn); + if (qh != 0) + cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn); + + while (cy != 0) + { + qh -= mpn_sub_1 (qp, qp, qn, 1); + cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn); + } + } + } + + TMP_FREE; + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/dcpi1_divappr_q.c b/gmp-6.3.0/mpn/generic/dcpi1_divappr_q.c new file mode 100644 index 0000000..0abe04e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/dcpi1_divappr_q.c @@ -0,0 +1,256 @@ +/* mpn_dcpi1_divappr_q -- divide-and-conquer division, returning approximate + quotient. The quotient returned is either correct, or one too large. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009, 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
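As the header above states, mpn_dcpi1_divappr_q may return a quotient that is one too large; mpn_dcpi1_div_q (earlier in this patch) settles that with a single multiply-back and compare. The same adjustment at the mpz level, as an illustrative helper only (fixup_approx_quotient is not a GMP function; the usual nonnegative operands are assumed):

#include <gmp.h>

/* Given a candidate quotient q for floor(n/d) that is either exact or one
   too large, multiply back and decrement if needed.  */
void
fixup_approx_quotient (mpz_t q, const mpz_t n, const mpz_t d)
{
  mpz_t t;
  mpz_init (t);
  mpz_mul (t, q, d);
  if (mpz_cmp (t, n) > 0)       /* q*d > n: the approximation was one too large */
    mpz_sub_ui (q, q, 1);
  mpz_clear (t);
}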
*/ + +#include "gmp-impl.h" +#include "longlong.h" + + +static mp_limb_t +mpn_dcpi1_divappr_q_n (mp_ptr qp, mp_ptr np, mp_srcptr dp, mp_size_t n, + gmp_pi1_t *dinv, mp_ptr tp) +{ + mp_size_t lo, hi; + mp_limb_t cy, qh, ql; + + lo = n >> 1; /* floor(n/2) */ + hi = n - lo; /* ceil(n/2) */ + + if (BELOW_THRESHOLD (hi, DC_DIV_QR_THRESHOLD)) + qh = mpn_sbpi1_div_qr (qp + lo, np + 2 * lo, 2 * hi, dp + lo, hi, dinv->inv32); + else + qh = mpn_dcpi1_div_qr_n (qp + lo, np + 2 * lo, dp + lo, hi, dinv, tp); + + mpn_mul (tp, qp + lo, hi, dp, lo); + + cy = mpn_sub_n (np + lo, np + lo, tp, n); + if (qh != 0) + cy += mpn_sub_n (np + n, np + n, dp, lo); + + while (cy != 0) + { + qh -= mpn_sub_1 (qp + lo, qp + lo, hi, 1); + cy -= mpn_add_n (np + lo, np + lo, dp, n); + } + + if (BELOW_THRESHOLD (lo, DC_DIVAPPR_Q_THRESHOLD)) + ql = mpn_sbpi1_divappr_q (qp, np + hi, 2 * lo, dp + hi, lo, dinv->inv32); + else + ql = mpn_dcpi1_divappr_q_n (qp, np + hi, dp + hi, lo, dinv, tp); + + if (UNLIKELY (ql != 0)) + { + mp_size_t i; + for (i = 0; i < lo; i++) + qp[i] = GMP_NUMB_MASK; + } + + return qh; +} + +mp_limb_t +mpn_dcpi1_divappr_q (mp_ptr qp, mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, gmp_pi1_t *dinv) +{ + mp_size_t qn; + mp_limb_t qh, cy, qsave; + mp_ptr tp; + TMP_DECL; + + TMP_MARK; + + ASSERT (dn >= 6); + ASSERT (nn > dn); + ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT); + + qn = nn - dn; + qp += qn; + np += nn; + dp += dn; + + if (qn >= dn) + { + qn++; /* pretend we'll need an extra limb */ + /* Reduce qn mod dn without division, optimizing small operations. */ + do + qn -= dn; + while (qn > dn); + + qp -= qn; /* point at low limb of next quotient block */ + np -= qn; /* point in the middle of partial remainder */ + + tp = TMP_SALLOC_LIMBS (dn); + + /* Perform the typically smaller block first. */ + if (qn == 1) + { + mp_limb_t q, n2, n1, n0, d1, d0; + + /* Handle qh up front, for simplicity. */ + qh = mpn_cmp (np - dn + 1, dp - dn, dn) >= 0; + if (qh) + ASSERT_NOCARRY (mpn_sub_n (np - dn + 1, np - dn + 1, dp - dn, dn)); + + /* A single iteration of schoolbook: One 3/2 division, + followed by the bignum update and adjustment. 
*/ + n2 = np[0]; + n1 = np[-1]; + n0 = np[-2]; + d1 = dp[-1]; + d0 = dp[-2]; + + ASSERT (n2 < d1 || (n2 == d1 && n1 <= d0)); + + if (UNLIKELY (n2 == d1) && n1 == d0) + { + q = GMP_NUMB_MASK; + cy = mpn_submul_1 (np - dn, dp - dn, dn, q); + ASSERT (cy == n2); + } + else + { + udiv_qr_3by2 (q, n1, n0, n2, n1, n0, d1, d0, dinv->inv32); + + if (dn > 2) + { + mp_limb_t cy, cy1; + cy = mpn_submul_1 (np - dn, dp - dn, dn - 2, q); + + cy1 = n0 < cy; + n0 = (n0 - cy) & GMP_NUMB_MASK; + cy = n1 < cy1; + n1 = (n1 - cy1) & GMP_NUMB_MASK; + np[-2] = n0; + + if (UNLIKELY (cy != 0)) + { + n1 += d1 + mpn_add_n (np - dn, np - dn, dp - dn, dn - 1); + qh -= (q == 0); + q = (q - 1) & GMP_NUMB_MASK; + } + } + else + np[-2] = n0; + + np[-1] = n1; + } + qp[0] = q; + } + else + { + if (qn == 2) + qh = mpn_divrem_2 (qp, 0L, np - 2, 4, dp - 2); + else if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD)) + qh = mpn_sbpi1_div_qr (qp, np - qn, 2 * qn, dp - qn, qn, dinv->inv32); + else + qh = mpn_dcpi1_div_qr_n (qp, np - qn, dp - qn, qn, dinv, tp); + + if (qn != dn) + { + if (qn > dn - qn) + mpn_mul (tp, qp, qn, dp - dn, dn - qn); + else + mpn_mul (tp, dp - dn, dn - qn, qp, qn); + + cy = mpn_sub_n (np - dn, np - dn, tp, dn); + if (qh != 0) + cy += mpn_sub_n (np - dn + qn, np - dn + qn, dp - dn, dn - qn); + + while (cy != 0) + { + qh -= mpn_sub_1 (qp, qp, qn, 1); + cy -= mpn_add_n (np - dn, np - dn, dp - dn, dn); + } + } + } + qn = nn - dn - qn + 1; + while (qn > dn) + { + qp -= dn; + np -= dn; + mpn_dcpi1_div_qr_n (qp, np - dn, dp - dn, dn, dinv, tp); + qn -= dn; + } + + /* Since we pretended we'd need an extra quotient limb before, we now + have made sure the code above left just dn-1=qn quotient limbs to + develop. Develop that plus a guard limb. */ + qn--; + qp -= qn; + np -= dn; + qsave = qp[qn]; + mpn_dcpi1_divappr_q_n (qp, np - dn, dp - dn, dn, dinv, tp); + MPN_COPY_INCR (qp, qp + 1, qn); + qp[qn] = qsave; + } + else /* (qn < dn) */ + { + mp_ptr q2p; +#if 0 /* not possible since we demand nn > dn */ + if (qn == 0) + { + qh = mpn_cmp (np - dn, dp - dn, dn) >= 0; + if (qh) + mpn_sub_n (np - dn, np - dn, dp - dn, dn); + TMP_FREE; + return qh; + } +#endif + + qp -= qn; /* point at low limb of next quotient block */ + np -= qn; /* point in the middle of partial remainder */ + + q2p = TMP_SALLOC_LIMBS (qn + 1); + /* Should we at all check DC_DIVAPPR_Q_THRESHOLD here, or reply on + callers not to be silly? */ + if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD)) + { + qh = mpn_sbpi1_divappr_q (q2p, np - qn - 2, 2 * (qn + 1), + dp - (qn + 1), qn + 1, dinv->inv32); + } + else + { + /* It is tempting to use qp for recursive scratch and put quotient in + tp, but the recursive scratch needs one limb too many. */ + tp = TMP_SALLOC_LIMBS (qn + 1); + qh = mpn_dcpi1_divappr_q_n (q2p, np - qn - 2, dp - (qn + 1), qn + 1, dinv, tp); + } + MPN_COPY (qp, q2p + 1, qn); + } + + TMP_FREE; + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/div_q.c b/gmp-6.3.0/mpn/generic/div_q.c new file mode 100644 index 0000000..18c4ecf --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_q.c @@ -0,0 +1,313 @@ +/* mpn_div_q -- division for arbitrary size operands. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2009, 2010, 2015, 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Compute Q = N/D with truncation. + N = {np,nn} + D = {dp,dn} + Q = {qp,nn-dn+1} + T = {scratch,nn+1} is scratch space + N and D are both untouched by the computation. + N and T may overlap; pass the same space if N is irrelevant after the call, + but note that tp needs an extra limb. + + Operand requirements: + N >= D > 0 + dp[dn-1] != 0 + No overlap between the N, D, and Q areas. + + This division function does not clobber its input operands, since it is + intended to support average-O(qn) division, and for that to be effective, it + cannot put requirements on callers to copy a O(nn) operand. + + If a caller does not care about the value of {np,nn+1} after calling this + function, it should pass np also for the scratch argument. This function + will then save some time and space by avoiding allocation and copying. + (FIXME: Is this a good design? We only really save any copying for + already-normalised divisors, which should be rare. It also prevents us from + reasonably asking for all scratch space we need.) + + We write nn-dn+1 limbs for the quotient, but return void. Why not return + the most significant quotient limb? Look at the 4 main code blocks below + (consisting of an outer if-else where each arm contains an if-else). It is + tricky for the first code block, since the mpn_*_div_q calls will typically + generate all nn-dn+1 and return 0 or 1. I don't see how to fix that unless + we generate the most significant quotient limb here, before calling + mpn_*_div_q, or put the quotient in a temporary area. Since this is a + critical division case (the SB sub-case in particular) copying is not a good + idea. + + It might make sense to split the if-else parts of the (qn + FUDGE + >= dn) blocks into separate functions, since we could promise quite + different things to callers in these two cases. The 'then' case + benefits from np=scratch, and it could perhaps even tolerate qp=np, + saving some headache for many callers. + + FIXME: Scratch allocation leaves a lot to be desired. E.g., for the MU size + operands, we do not reuse the huge scratch for adjustments. This can be a + serious waste of memory for the largest operands. +*/ + +/* FUDGE determines when to try getting an approximate quotient from the upper + parts of the dividend and divisor, then adjust. N.B. FUDGE must be >= 2 + for the code to be correct. 
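+
+   Illustrative numbers: with nn = 100 and dn = 60 we get qn = 41, and
+   qn + FUDGE = 46 < dn, so mpn_div_q computes an approximate quotient from
+   the top 2*qn + 1 = 83 dividend limbs and the top qn + 1 = 42 divisor
+   limbs, then adjusts it if necessary.  With nn = 100 and dn = 40 we get
+   qn = 61 and qn + FUDGE >= dn, so a full-length division is used
+   directly.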
*/ +#define FUDGE 5 /* FIXME: tune this */ + +#define DC_DIV_Q_THRESHOLD DC_DIVAPPR_Q_THRESHOLD +#define MU_DIV_Q_THRESHOLD MU_DIVAPPR_Q_THRESHOLD +#define MUPI_DIV_Q_THRESHOLD MUPI_DIVAPPR_Q_THRESHOLD +#ifndef MUPI_DIVAPPR_Q_THRESHOLD +#define MUPI_DIVAPPR_Q_THRESHOLD MUPI_DIV_QR_THRESHOLD +#endif + +void +mpn_div_q (mp_ptr qp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, mp_ptr scratch) +{ + mp_ptr new_dp, new_np, tp, rp; + mp_limb_t cy, dh, qh; + mp_size_t new_nn, qn; + gmp_pi1_t dinv; + int cnt; + TMP_DECL; + TMP_MARK; + + ASSERT (nn >= dn); + ASSERT (dn > 0); + ASSERT (dp[dn - 1] != 0); + ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, np, nn)); + ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1, dp, dn)); + ASSERT (MPN_SAME_OR_SEPARATE_P (np, scratch, nn)); + + ASSERT_ALWAYS (FUDGE >= 2); + + dh = dp[dn - 1]; + if (dn == 1) + { + mpn_divrem_1 (qp, 0L, np, nn, dh); + return; + } + + qn = nn - dn + 1; /* Quotient size, high limb might be zero */ + + if (qn + FUDGE >= dn) + { + /* |________________________| + |_______| */ + new_np = scratch; + + if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0)) + { + count_leading_zeros (cnt, dh); + + cy = mpn_lshift (new_np, np, nn, cnt); + new_np[nn] = cy; + new_nn = nn + (cy != 0); + + new_dp = TMP_ALLOC_LIMBS (dn); + mpn_lshift (new_dp, dp, dn, cnt); + + if (dn == 2) + { + qh = mpn_divrem_2 (qp, 0L, new_np, new_nn, new_dp); + } + else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) || + BELOW_THRESHOLD (new_nn - dn, DC_DIV_Q_THRESHOLD)) + { + invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]); + qh = mpn_sbpi1_div_q (qp, new_np, new_nn, new_dp, dn, dinv.inv32); + } + else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) || /* fast condition */ + BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */ + (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */ + + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn) /* ...condition */ + { + invert_pi1 (dinv, new_dp[dn - 1], new_dp[dn - 2]); + qh = mpn_dcpi1_div_q (qp, new_np, new_nn, new_dp, dn, &dinv); + } + else + { + mp_size_t itch = mpn_mu_div_q_itch (new_nn, dn, 0); + mp_ptr scratch = TMP_ALLOC_LIMBS (itch); + qh = mpn_mu_div_q (qp, new_np, new_nn, new_dp, dn, scratch); + } + if (cy == 0) + qp[qn - 1] = qh; + else + ASSERT (qh == 0); + } + else /* divisor is already normalised */ + { + if (new_np != np) + MPN_COPY (new_np, np, nn); + + if (dn == 2) + { + qh = mpn_divrem_2 (qp, 0L, new_np, nn, dp); + } + else if (BELOW_THRESHOLD (dn, DC_DIV_Q_THRESHOLD) || + BELOW_THRESHOLD (nn - dn, DC_DIV_Q_THRESHOLD)) + { + invert_pi1 (dinv, dh, dp[dn - 2]); + qh = mpn_sbpi1_div_q (qp, new_np, nn, dp, dn, dinv.inv32); + } + else if (BELOW_THRESHOLD (dn, MUPI_DIV_Q_THRESHOLD) || /* fast condition */ + BELOW_THRESHOLD (nn, 2 * MU_DIV_Q_THRESHOLD) || /* fast condition */ + (double) (2 * (MU_DIV_Q_THRESHOLD - MUPI_DIV_Q_THRESHOLD)) * dn /* slow... */ + + (double) MUPI_DIV_Q_THRESHOLD * nn > (double) dn * nn) /* ...condition */ + { + invert_pi1 (dinv, dh, dp[dn - 2]); + qh = mpn_dcpi1_div_q (qp, new_np, nn, dp, dn, &dinv); + } + else + { + mp_size_t itch = mpn_mu_div_q_itch (nn, dn, 0); + mp_ptr scratch = TMP_ALLOC_LIMBS (itch); + qh = mpn_mu_div_q (qp, np, nn, dp, dn, scratch); + } + qp[nn - dn] = qh; + } + } + else + { + /* |________________________| + |_________________| */ + tp = TMP_ALLOC_LIMBS (qn + 1); + + new_np = scratch; + new_nn = 2 * qn + 1; + if (new_np == np) + /* We need {np,nn} to remain untouched until the final adjustment, so + we need to allocate separate space for new_np. 
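+	   (The final adjustment multiplies the candidate quotient by {dp,dn}
+	   and compares the product with the original {np,nn}, decrementing
+	   the quotient by one if the approximation overshot; see the
+	   tp[0] <= 4 block at the end of this function.)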
*/ + new_np = TMP_ALLOC_LIMBS (new_nn + 1); + + + if (LIKELY ((dh & GMP_NUMB_HIGHBIT) == 0)) + { + count_leading_zeros (cnt, dh); + + cy = mpn_lshift (new_np, np + nn - new_nn, new_nn, cnt); + new_np[new_nn] = cy; + + new_nn += (cy != 0); + + new_dp = TMP_ALLOC_LIMBS (qn + 1); + mpn_lshift (new_dp, dp + dn - (qn + 1), qn + 1, cnt); + new_dp[0] |= dp[dn - (qn + 1) - 1] >> (GMP_NUMB_BITS - cnt); + + if (qn + 1 == 2) + { + qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp); + } + else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1)) + { + invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]); + qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32); + } + else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1)) + { + invert_pi1 (dinv, new_dp[qn], new_dp[qn - 1]); + qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv); + } + else + { + mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0); + mp_ptr scratch = TMP_ALLOC_LIMBS (itch); + qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch); + } + if (cy == 0) + tp[qn] = qh; + else if (UNLIKELY (qh != 0)) + { + /* This happens only when the quotient is close to B^n and + mpn_*_divappr_q returned B^n. */ + mp_size_t i, n; + n = new_nn - (qn + 1); + for (i = 0; i < n; i++) + tp[i] = GMP_NUMB_MAX; + qh = 0; /* currently ignored */ + } + } + else /* divisor is already normalised */ + { + MPN_COPY (new_np, np + nn - new_nn, new_nn); /* pointless if MU will be used */ + + new_dp = (mp_ptr) dp + dn - (qn + 1); + + if (qn == 2 - 1) + { + qh = mpn_divrem_2 (tp, 0L, new_np, new_nn, new_dp); + } + else if (BELOW_THRESHOLD (qn, DC_DIVAPPR_Q_THRESHOLD - 1)) + { + invert_pi1 (dinv, dh, new_dp[qn - 1]); + qh = mpn_sbpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, dinv.inv32); + } + else if (BELOW_THRESHOLD (qn, MU_DIVAPPR_Q_THRESHOLD - 1)) + { + invert_pi1 (dinv, dh, new_dp[qn - 1]); + qh = mpn_dcpi1_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, &dinv); + } + else + { + mp_size_t itch = mpn_mu_divappr_q_itch (new_nn, qn + 1, 0); + mp_ptr scratch = TMP_ALLOC_LIMBS (itch); + qh = mpn_mu_divappr_q (tp, new_np, new_nn, new_dp, qn + 1, scratch); + } + tp[qn] = qh; + } + + MPN_COPY (qp, tp + 1, qn); + if (tp[0] <= 4) + { + mp_size_t rn; + + rp = TMP_ALLOC_LIMBS (dn + qn); + mpn_mul (rp, dp, dn, tp + 1, qn); + rn = dn + qn; + rn -= rp[rn - 1] == 0; + + if (rn > nn || mpn_cmp (np, rp, nn) < 0) + MPN_DECR_U (qp, qn, 1); + } + } + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/div_qr_1.c b/gmp-6.3.0/mpn/generic/div_qr_1.c new file mode 100644 index 0000000..8f80d37 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_1.c @@ -0,0 +1,125 @@ +/* mpn_div_qr_1 -- mpn by limb division. + + Contributed to the GNU project by Niels Möller and Torbjörn Granlund + +Copyright 1991, 1993, 1994, 1996, 1998-2000, 2002, 2003, 2013 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef DIV_QR_1_NORM_THRESHOLD +#define DIV_QR_1_NORM_THRESHOLD 3 +#endif +#ifndef DIV_QR_1_UNNORM_THRESHOLD +#define DIV_QR_1_UNNORM_THRESHOLD 3 +#endif + +#if GMP_NAIL_BITS > 0 +#error Nail bits not supported +#endif + +/* Divides {up, n} by d. Writes the n-1 low quotient limbs at {qp, + * n-1}, and the high quotient limb at *qh. Returns remainder. */ +mp_limb_t +mpn_div_qr_1 (mp_ptr qp, mp_limb_t *qh, mp_srcptr up, mp_size_t n, + mp_limb_t d) +{ + unsigned cnt; + mp_limb_t uh; + + ASSERT (n > 0); + ASSERT (d > 0); + + if (d & GMP_NUMB_HIGHBIT) + { + /* Normalized case */ + mp_limb_t dinv, q; + + uh = up[--n]; + + q = (uh >= d); + *qh = q; + uh -= (-q) & d; + + if (BELOW_THRESHOLD (n, DIV_QR_1_NORM_THRESHOLD)) + { + cnt = 0; + plain: + while (n > 0) + { + mp_limb_t ul = up[--n]; + udiv_qrnnd (qp[n], uh, uh, ul, d); + } + return uh >> cnt; + } + invert_limb (dinv, d); + return mpn_div_qr_1n_pi1 (qp, up, n, uh, d, dinv); + } + else + { + /* Unnormalized case */ + mp_limb_t dinv, ul; + + if (! UDIV_NEEDS_NORMALIZATION + && BELOW_THRESHOLD (n, DIV_QR_1_UNNORM_THRESHOLD)) + { + uh = up[--n]; + udiv_qrnnd (*qh, uh, CNST_LIMB(0), uh, d); + cnt = 0; + goto plain; + } + + count_leading_zeros (cnt, d); + d <<= cnt; + +#if HAVE_NATIVE_mpn_div_qr_1u_pi1 + /* FIXME: Call loop doing on-the-fly normalization */ +#endif + + /* Shift up front, use qp area for shifted copy. A bit messy, + since we have only n-1 limbs available, and shift the high + limb manually. */ + uh = up[--n]; + ul = (uh << cnt) | mpn_lshift (qp, up, n, cnt); + uh >>= (GMP_LIMB_BITS - cnt); + + if (UDIV_NEEDS_NORMALIZATION + && BELOW_THRESHOLD (n, DIV_QR_1_UNNORM_THRESHOLD)) + { + udiv_qrnnd (*qh, uh, uh, ul, d); + up = qp; + goto plain; + } + invert_limb (dinv, d); + + udiv_qrnnd_preinv (*qh, uh, uh, ul, d, dinv); + return mpn_div_qr_1n_pi1 (qp, qp, n, uh, d, dinv) >> cnt; + } +} diff --git a/gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c b/gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c new file mode 100644 index 0000000..4977131 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_1n_pi1.c @@ -0,0 +1,505 @@ +/* mpn_div_qr_1n_pi1 + + Contributed to the GNU project by Niels Möller + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#if GMP_NAIL_BITS > 0 +#error Nail bits not supported +#endif + +#ifndef DIV_QR_1N_METHOD +#define DIV_QR_1N_METHOD 2 +#endif + +/* FIXME: Duplicated in mod_1_1.c. Move to gmp-impl.h */ + +#if defined (__GNUC__) && ! defined (NO_ASM) + +#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add %6, %k2\n\t" \ + "adc %4, %k1\n\t" \ + "sbb %k0, %k0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((USItype)(a1)), "g" ((USItype)(b1)), \ + "%2" ((USItype)(a0)), "g" ((USItype)(b0))) +#endif + +#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add %6, %q2\n\t" \ + "adc %4, %q1\n\t" \ + "sbb %q0, %q0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0))) +#endif + +#if defined (__sparc__) && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addxcc %r3, %4, %1\n\t" \ + "subx %%g0, %%g0, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \ + __CLOBBER_CC) +#endif + +#if defined (__sparc__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addccc %r7, %8, %%g0\n\t" \ + "addccc %r3, %4, %1\n\t" \ + "clr %0\n\t" \ + "movcs %%xcc, -1, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \ + "rJ" ((al) >> 32), "rI" ((bl) >> 32) \ + __CLOBBER_CC) +#if __VIS__ >= 0x300 +#undef add_mssaaaa +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addxccc %r3, %4, %1\n\t" \ + "clr %0\n\t" \ + "movcs %%xcc, -1, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \ + __CLOBBER_CC) +#endif +#endif + +#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB) +/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a + processor running in 32-bit mode, since the carry flag then gets the 32-bit + carry. 
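+
+   As with the other variants, add_mssaaaa here computes
+   {s1,s0} = {a1,a0} + {b1,b0} and sets m to 0 if the two-limb sum does not
+   carry out and to the all-ones mask -1 if it does; the generic C fallback
+   below makes this explicit.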
*/ +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add%I6c %2, %5, %6\n\t" \ + "adde %1, %3, %4\n\t" \ + "subfe %0, %0, %0\n\t" \ + "nor %0, %0, %0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#if defined (__s390x__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "algr %2, %6\n\t" \ + "alcgr %1, %4\n\t" \ + "lghi %0, 0\n\t" \ + "alcgr %0, %0\n\t" \ + "lcgr %0, %0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC) +#endif + +#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "adds %2, %5, %6\n\t" \ + "adcs %1, %3, %4\n\t" \ + "movcc %0, #0\n\t" \ + "movcs %0, #-1" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC) +#endif + +#if defined (__aarch64__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "adds %2, %x5, %6\n\t" \ + "adcs %1, %x3, %x4\n\t" \ + "csinv %0, xzr, xzr, cc\n\t" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rZ" (ah), "rZ" (bh), "%rZ" (al), "rI" (bl) __CLOBBER_CC) +#endif +#endif /* defined (__GNUC__) */ + +#ifndef add_mssaaaa +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + do { \ + UWtype __s0, __s1, __c0, __c1; \ + __s0 = (a0) + (b0); \ + __s1 = (a1) + (b1); \ + __c0 = __s0 < (a0); \ + __c1 = __s1 < (a1); \ + (s0) = __s0; \ + __s1 = __s1 + __c0; \ + (s1) = __s1; \ + (m) = - (__c1 + (__s1 < __c0)); \ + } while (0) +#endif + +#if DIV_QR_1N_METHOD == 1 + +/* Divides (uh B^n + {up, n}) by d, storing the quotient at {qp, n}. + Requires that uh < d. */ +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t uh, + mp_limb_t d, mp_limb_t dinv) +{ + ASSERT (n > 0); + ASSERT (uh < d); + ASSERT (d & GMP_NUMB_HIGHBIT); + ASSERT (MPN_SAME_OR_SEPARATE_P (qp, up, n)); + + do + { + mp_limb_t q, ul; + + ul = up[--n]; + udiv_qrnnd_preinv (q, uh, uh, ul, d, dinv); + qp[n] = q; + } + while (n > 0); + + return uh; +} + +#elif DIV_QR_1N_METHOD == 2 + +/* The main idea of this algorithm is to write B^2 = d (B + dinv) + + B2, where 1 <= B2 < d. Similarly to mpn_mod_1_1p, each iteration + can then replace + + u1 B^2 = u1 B2 (mod d) + + which gives a very short critical path for computing the remainder + (with some tricks to handle the carry when the next two lower limbs + are added in). To also get the quotient, include the corresponding + multiple of d in the expression, + + u1 B^2 = u1 B2 + (u1 dinv + u1 B) d + + We get the quotient by accumulating the (u1 dinv + u1 B) terms. The + two multiplies, u1 * B2 and u1 * dinv, are independent, and can be + executed in parallel. + */ +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1, + mp_limb_t d, mp_limb_t dinv) +{ + mp_limb_t B2; + mp_limb_t u0, u2; + mp_limb_t q0, q1; + mp_limb_t p0, p1; + mp_limb_t t; + mp_size_t j; + + ASSERT (d & GMP_LIMB_HIGHBIT); + ASSERT (n > 0); + ASSERT (u1 < d); + + if (n == 1) + { + udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv); + return u1; + } + + /* FIXME: Could be precomputed */ + B2 = -d*dinv; + + umul_ppmm (q1, q0, dinv, u1); + umul_ppmm (p1, p0, B2, u1); + q1 += u1; + ASSERT (q1 >= u1); + u0 = up[n-1]; /* Early read, to allow qp == up. 
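+				   (qp[n-1] is stored on the next line, so
+				   when qp and up alias, up[n-1] must be read
+				   before that store.)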
*/ + qp[n-1] = q1; + + add_mssaaaa (u2, u1, u0, u0, up[n-2], p1, p0); + + /* FIXME: Keep q1 in a variable between iterations, to reduce number + of memory accesses. */ + for (j = n-2; j-- > 0; ) + { + mp_limb_t q2, cy; + + /* Additions for the q update: + * +-------+ + * |u1 * v | + * +---+---+ + * | u1| + * +---+---+ + * | 1 | v | (conditional on u2) + * +---+---+ + * | 1 | (conditional on u0 + u2 B2 carry) + * +---+ + * + | q0| + * -+---+---+---+ + * | q2| q1| q0| + * +---+---+---+ + */ + umul_ppmm (p1, t, u1, dinv); + ADDC_LIMB (cy, u0, u0, u2 & B2); + u0 -= (-cy) & d; + add_ssaaaa (q2, q1, -u2, u2 & dinv, CNST_LIMB(0), u1); + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), q0); + q0 = t; + + /* Note that p1 + cy cannot overflow */ + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), p1 + cy); + + umul_ppmm (p1, p0, u1, B2); + + qp[j+1] = q1; + MPN_INCR_U (qp+j+2, n-j-2, q2); + + add_mssaaaa (u2, u1, u0, u0, up[j], p1, p0); + } + + q1 = (u2 > 0); + u1 -= (-q1) & d; + + t = (u1 >= d); + q1 += t; + u1 -= (-t) & d; + + udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv); + add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t); + + MPN_INCR_U (qp+1, n-1, q1); + + qp[0] = q0; + return u0; +} + +#elif DIV_QR_1N_METHOD == 3 + +/* This variant handles carry from the u update earlier. This gives a + longer critical path, but reduces the work needed for the + quotients. */ +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1, + mp_limb_t d, mp_limb_t dinv) +{ + mp_limb_t B2; + mp_limb_t cy, u0; + mp_limb_t q0, q1; + mp_limb_t p0, p1; + mp_limb_t t; + mp_size_t j; + + ASSERT (d & GMP_LIMB_HIGHBIT); + ASSERT (n > 0); + ASSERT (u1 < d); + + if (n == 1) + { + udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv); + return u1; + } + + /* FIXME: Could be precomputed */ + B2 = -d*dinv; + + umul_ppmm (q1, q0, dinv, u1); + umul_ppmm (p1, p0, B2, u1); + q1 += u1; + ASSERT (q1 >= u1); + u0 = up[n-1]; /* Early read, to allow qp == up. */ + + add_mssaaaa (cy, u1, u0, u0, up[n-2], p1, p0); + u1 -= cy & d; + q1 -= cy; + qp[n-1] = q1; + + /* FIXME: Keep q1 in a variable between iterations, to reduce number + of memory accesses. */ + for (j = n-2; j-- > 0; ) + { + mp_limb_t q2, cy; + mp_limb_t t1, t0; + + /* Additions for the q update: + * +-------+ + * |u1 * v | + * +---+---+ + * | u1| + * +---+ + * | 1 | (conditional on {u1, u0} carry) + * +---+ + * + | q0| + * -+---+---+---+ + * | q2| q1| q0| + * +---+---+---+ + * + * Additions for the u update: + * +-------+ + * |u1 * B2| + * +---+---+ + * + |u0 |u-1| + * +---+---+ + * - | d | (conditional on carry) + * ---+---+---+ + * |u1 | u0| + * +---+---+ + * + */ + umul_ppmm (p1, p0, u1, B2); + ADDC_LIMB (q2, q1, u1, q0); + umul_ppmm (t1, t0, u1, dinv); + add_mssaaaa (cy, u1, u0, u0, up[j], p1, p0); + u1 -= cy & d; + + /* t1 <= B-2, so cy can be added in without overflow. 
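+	     (t1 is the high limb of u1 * dinv with u1, dinv <= B-1, hence
+	     t1 <= B-2; cy is the 0 / -1 carry mask produced by add_mssaaaa,
+	     so t1 - cy adds at most 1 and cannot wrap.)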
*/ + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - cy); + q0 = t0; + + /* Final q update */ + qp[j+1] = q1; + MPN_INCR_U (qp+j+2, n-j-2, q2); + } + + q1 = (u1 >= d); + u1 -= (-q1) & d; + + udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv); + add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t); + + MPN_INCR_U (qp+1, n-1, q1); + + qp[0] = q0; + return u0; +} + +#elif DIV_QR_1N_METHOD == 4 + +mp_limb_t +mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1, + mp_limb_t d, mp_limb_t dinv) +{ + mp_limb_t B2; + mp_limb_t u2, u0; + mp_limb_t q0, q1; + mp_limb_t p0, p1; + mp_limb_t B2d0, B2d1; + mp_limb_t t; + mp_size_t j; + + ASSERT (d & GMP_LIMB_HIGHBIT); + ASSERT (n > 0); + ASSERT (u1 < d); + + if (n == 1) + { + udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv); + return u1; + } + + /* FIXME: Could be precomputed */ + B2 = -d*dinv; + /* B2 * (B-d) */ + umul_ppmm (B2d1, B2d0, B2, -d); + + umul_ppmm (q1, q0, dinv, u1); + umul_ppmm (p1, p0, B2, u1); + q1 += u1; + ASSERT (q1 >= u1); + + add_mssaaaa (u2, u1, u0, up[n-1], up[n-2], p1, p0); + + /* After read of up[n-1], to allow qp == up. */ + qp[n-1] = q1 - u2; + + /* FIXME: Keep q1 in a variable between iterations, to reduce number + of memory accesses. */ + for (j = n-2; j-- > 0; ) + { + mp_limb_t q2, cy; + mp_limb_t t1, t0; + + /* Additions for the q update. *After* u1 -= u2 & d adjustment. + * +-------+ + * |u1 * v | + * +---+---+ + * | u1| + * +---+ + * | 1 | (conditional on {u1, u0} carry) + * +---+ + * + | q0| + * -+---+---+---+ + * | q2| q1| q0| + * +---+---+---+ + * + * Additions for the u update. *Before* u1 -= u2 & d adjstment. + * +-------+ + * |u1 * B2| + * +---+---+ + * |u0 |u-1| + * +---+---+ + + + |B2(B-d)| (conditional on u2) + * -+---+---+---+ + * |u2 |u1 | u0| + * +---+---+---+ + * + */ + /* Multiply with unadjusted u1, to shorten critical path. */ + umul_ppmm (p1, p0, u1, B2); + u1 -= (d & u2); + ADDC_LIMB (q2, q1, u1, q0); + umul_ppmm (t1, t0, u1, dinv); + + add_mssaaaa (cy, u1, u0, u0, up[j], u2 & B2d1, u2 & B2d0); + add_mssaaaa (u2, u1, u0, u1, u0, p1, p0); + u2 += cy; + ASSERT(-u2 <= 1); + + /* t1 <= B-2, so u2 can be added in without overflow. */ + add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - u2); + q0 = t0; + + /* Final q update */ + qp[j+1] = q1; + MPN_INCR_U (qp+j+2, n-j-2, q2); + } + u1 -= u2 & d; + + q1 = (u1 >= d); + u1 -= (-q1) & d; + + udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv); + add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t); + + MPN_INCR_U (qp+1, n-1, q1); + + qp[0] = q0; + return u0; +} +#else +#error Unknown DIV_QR_1N_METHOD +#endif diff --git a/gmp-6.3.0/mpn/generic/div_qr_1n_pi2.c b/gmp-6.3.0/mpn/generic/div_qr_1n_pi2.c new file mode 100644 index 0000000..daae68f --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_1n_pi2.c @@ -0,0 +1,203 @@ +/* mpn_div_qr_1n_pi2. + + THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS + ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2013, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* ISSUES: + + * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv? + + * Are there any problems with generating n quotient limbs in the q area? It + surely simplifies things. + + * Not yet adequately tested. +*/ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Define some longlong.h-style macros, but for wider operations. + * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into + an additional sum operand. +*/ +#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM) + +#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((USItype)(s2)), \ + "1" ((USItype)(a1)), "g" ((USItype)(b1)), \ + "%2" ((USItype)(a0)), "g" ((USItype)(b0))) +#endif + +#if defined (__amd64__) && W_TYPE_SIZE == 64 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((UDItype)(s2)), \ + "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0))) +#endif + +#if defined (__aarch64__) && W_TYPE_SIZE == 64 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %x3, xzr"\ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "rZ" (s2), "%rZ" (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB) +/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a + processor running in 32-bit mode, since the carry flag then gets the 32-bit + carry. */ +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#endif /* __GNUC__ */ + +#ifndef add_sssaaaa +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + do { \ + UWtype __s0, __s1, __c0, __c1; \ + __s0 = (a0) + (b0); \ + __s1 = (a1) + (b1); \ + __c0 = __s0 < (a0); \ + __c1 = __s1 < (a1); \ + (s0) = __s0; \ + __s1 = __s1 + __c0; \ + (s1) = __s1; \ + (s2) += __c1 + (__s1 < __c0); \ + } while (0) +#endif + +struct precomp_div_1_pi2 +{ + mp_limb_t dip[2]; + mp_limb_t d; + int norm_cnt; +}; + +mp_limb_t +mpn_div_qr_1n_pi2 (mp_ptr qp, + mp_srcptr up, mp_size_t un, + struct precomp_div_1_pi2 *pd) +{ + mp_limb_t most_significant_q_limb; + mp_size_t i; + mp_limb_t r, u2, u1, u0; + mp_limb_t d0, di1, di0; + mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d; + mp_limb_t cnd; + + ASSERT (un >= 2); + ASSERT ((pd->d & GMP_NUMB_HIGHBIT) != 0); + ASSERT (! 
MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up); + ASSERT_MPN (up, un); + +#define q3 q3a +#define q2 q2b +#define q1 q1b + + up += un - 3; + r = up[2]; + d0 = pd->d; + + most_significant_q_limb = (r >= d0); + r -= d0 & -most_significant_q_limb; + + qp += un - 3; + qp[2] = most_significant_q_limb; + + di1 = pd->dip[1]; + di0 = pd->dip[0]; + + for (i = un - 3; i >= 0; i -= 2) + { + u2 = r; + u1 = up[1]; + u0 = up[0]; + + /* Dividend in {r,u1,u0} */ + + umul_ppmm (q1d,q0d, u1, di0); + umul_ppmm (q2b,q1b, u1, di1); + q2b++; /* cannot spill */ + add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0); + + umul_ppmm (q2c,q1c, u2, di0); + add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c); + umul_ppmm (q3a,q2a, u2, di1); + + add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d); + + q3 += r; + + r = u0 - q2 * d0; + + cnd = (r >= q1); + r += d0 & -cnd; + sub_ddmmss (q3,q2, q3,q2, 0,cnd); + + if (UNLIKELY (r >= d0)) + { + r -= d0; + add_ssaaaa (q3,q2, q3,q2, 0,1); + } + + qp[0] = q2; + qp[1] = q3; + + up -= 2; + qp -= 2; + } + + if ((un & 1) == 0) + { + u2 = r; + u1 = up[1]; + + udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1); + qp[1] = q3; + } + + return r; + +#undef q3 +#undef q2 +#undef q1 +} diff --git a/gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c b/gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c new file mode 100644 index 0000000..ea38e3c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_1u_pi2.c @@ -0,0 +1,236 @@ +/* mpn_div_qr_1u_pi2. + + THIS FILE CONTAINS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS + ONLY SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2013, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* ISSUES: + + * Can we really use the high pi2 inverse limb for udiv_qrnnd_preinv? + + * Are there any problems with generating n quotient limbs in the q area? It + surely simplifies things. + + * Not yet adequately tested. +*/ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Define some longlong.h-style macros, but for wider operations. + * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into + an additional sum operand. +*/ +#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! 
defined (NO_ASM) + +#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((USItype)(s2)), \ + "1" ((USItype)(a1)), "g" ((USItype)(b1)), \ + "%2" ((USItype)(a0)), "g" ((USItype)(b0))) +#endif + +#if defined (__amd64__) && W_TYPE_SIZE == 64 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((UDItype)(s2)), \ + "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0))) +#endif + +#if defined (__aarch64__) && W_TYPE_SIZE == 64 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %x3, xzr"\ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "rZ" (s2), "%rZ" (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB) +/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a + processor running in 32-bit mode, since the carry flag then gets the 32-bit + carry. */ +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#endif /* __GNUC__ */ + +#ifndef add_sssaaaa +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + do { \ + UWtype __s0, __s1, __c0, __c1; \ + __s0 = (a0) + (b0); \ + __s1 = (a1) + (b1); \ + __c0 = __s0 < (a0); \ + __c1 = __s1 < (a1); \ + (s0) = __s0; \ + __s1 = __s1 + __c0; \ + (s1) = __s1; \ + (s2) += __c1 + (__s1 < __c0); \ + } while (0) +#endif + +struct precomp_div_1_pi2 +{ + mp_limb_t dip[2]; + mp_limb_t d; + int norm_cnt; +}; + +mp_limb_t +mpn_div_qr_1u_pi2 (mp_ptr qp, + mp_srcptr up, mp_size_t un, + struct precomp_div_1_pi2 *pd) +{ + mp_size_t i; + mp_limb_t r, u2, u1, u0; + mp_limb_t d0, di1, di0; + mp_limb_t q3a, q2a, q2b, q1b, q2c, q1c, q1d, q0d; + mp_limb_t cnd; + int cnt; + + ASSERT (un >= 2); + ASSERT ((pd->d & GMP_NUMB_HIGHBIT) == 0); + ASSERT (! 
MPN_OVERLAP_P (qp, un-2, up, un) || qp+2 >= up); + ASSERT_MPN (up, un); + +#define q3 q3a +#define q2 q2b +#define q1 q1b + + up += un - 3; + cnt = pd->norm_cnt; + r = up[2] >> (GMP_NUMB_BITS - cnt); + d0 = pd->d << cnt; + + qp += un - 2; + + di1 = pd->dip[1]; + di0 = pd->dip[0]; + + for (i = un - 3; i >= 0; i -= 2) + { + u2 = r; + u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt)); + u0 = (up[1] << cnt) | (up[0] >> (GMP_NUMB_BITS - cnt)); + + /* Dividend in {r,u1,u0} */ + + umul_ppmm (q1d,q0d, u1, di0); + umul_ppmm (q2b,q1b, u1, di1); + q2b++; /* cannot spill */ + add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0); + + umul_ppmm (q2c,q1c, u2, di0); + add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c); + umul_ppmm (q3a,q2a, u2, di1); + + add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d); + + q3 += r; + + r = u0 - q2 * d0; + + cnd = (r >= q1); + r += d0 & -cnd; + sub_ddmmss (q3,q2, q3,q2, 0,cnd); + + if (UNLIKELY (r >= d0)) + { + r -= d0; + add_ssaaaa (q3,q2, q3,q2, 0,1); + } + + qp[0] = q2; + qp[1] = q3; + + up -= 2; + qp -= 2; + } + + if ((un & 1) != 0) + { + u2 = r; + u1 = (up[2] << cnt); + + udiv_qrnnd_preinv (q3, r, u2, u1, d0, di1); + qp[1] = q3; + } + else + { + u2 = r; + u1 = (up[2] << cnt) | (up[1] >> (GMP_NUMB_BITS - cnt)); + u0 = (up[1] << cnt); + + /* Dividend in {r,u1,u0} */ + + umul_ppmm (q1d,q0d, u1, di0); + umul_ppmm (q2b,q1b, u1, di1); + q2b++; /* cannot spill */ + add_sssaaaa (r,q2b,q1b, q2b,q1b, u1,u0); + + umul_ppmm (q2c,q1c, u2, di0); + add_sssaaaa (r,q2b,q1b, q2b,q1b, q2c,q1c); + umul_ppmm (q3a,q2a, u2, di1); + + add_sssaaaa (r,q2b,q1b, q2b,q1b, q2a,q1d); + + q3 += r; + + r = u0 - q2 * d0; + + cnd = (r >= q1); + r += d0 & -cnd; + sub_ddmmss (q3,q2, q3,q2, 0,cnd); + + if (UNLIKELY (r >= d0)) + { + r -= d0; + add_ssaaaa (q3,q2, q3,q2, 0,1); + } + + qp[0] = q2; + qp[1] = q3; + } + + return r >> cnt; + +#undef q3 +#undef q2 +#undef q1 +} diff --git a/gmp-6.3.0/mpn/generic/div_qr_2.c b/gmp-6.3.0/mpn/generic/div_qr_2.c new file mode 100644 index 0000000..c3c8f57 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_2.c @@ -0,0 +1,314 @@ +/* mpn_div_qr_2 -- Divide natural numbers, producing both remainder and + quotient. The divisor is two limbs. + + Contributed to the GNU project by Torbjorn Granlund and Niels Möller + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright 1993-1996, 1999-2002, 2011, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef DIV_QR_2_PI2_THRESHOLD +/* Disabled unless explicitly tuned. */ +#define DIV_QR_2_PI2_THRESHOLD MP_LIMB_T_MAX +#endif + +#ifndef SANITY_CHECK +#define SANITY_CHECK 0 +#endif + +/* Define some longlong.h-style macros, but for wider operations. + * add_sssaaaa is like longlong.h's add_ssaaaa but propagating carry-out into + an additional sum operand. + * add_csaac accepts two addends and a carry in, and generates a sum and a + carry out. A little like a "full adder". +*/ +#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (NO_ASM) + +#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add\t%7, %k2\n\tadc\t%5, %k1\n\tadc\t$0, %k0" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((USItype)(s2)), \ + "1" ((USItype)(a1)), "g" ((USItype)(b1)), \ + "%2" ((USItype)(a0)), "g" ((USItype)(b0))) +#endif + +#if defined (__amd64__) && W_TYPE_SIZE == 64 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add\t%7, %q2\n\tadc\t%5, %q1\n\tadc\t$0, %q0" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "0" ((UDItype)(s2)), \ + "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0))) +#endif + +#if defined (__aarch64__) && W_TYPE_SIZE == 64 +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("adds\t%2, %x6, %7\n\tadcs\t%1, %x4, %x5\n\tadc\t%0, %x3, xzr"\ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "rZ" (s2), "%rZ" (a1), "rZ" (b1), "%rZ" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB) +/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a + processor running in 32-bit mode, since the carry flag then gets the 32-bit + carry. */ +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + __asm__ ("add%I7c\t%2,%6,%7\n\tadde\t%1,%4,%5\n\taddze\t%0,%3" \ + : "=r" (s2), "=&r" (s1), "=&r" (s0) \ + : "r" (s2), "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#endif /* __GNUC__ */ + +#ifndef add_sssaaaa +#define add_sssaaaa(s2, s1, s0, a1, a0, b1, b0) \ + do { \ + UWtype __s0, __s1, __c0, __c1; \ + __s0 = (a0) + (b0); \ + __s1 = (a1) + (b1); \ + __c0 = __s0 < (a0); \ + __c1 = __s1 < (a1); \ + (s0) = __s0; \ + __s1 = __s1 + __c0; \ + (s1) = __s1; \ + (s2) += __c1 + (__s1 < __c0); \ + } while (0) +#endif + +/* Typically used with r1, r0 same as n3, n2. Other types of overlap + between inputs and outputs are not supported. 
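+
+   udiv_qr_4by2 performs one 4/2 division step: it divides {n3,n2,n1,n0} by
+   the normalized two-limb divisor {d1,d0} using the precomputed two-limb
+   inverse {di1,di0} from invert_4by2 below, leaving the two quotient limbs
+   in {q1,q0} and the two remainder limbs in {r1,r0}.  As used in
+   mpn_div_qr_2n_pi2, the incoming {n3,n2} is the running remainder and is
+   already reduced below {d1,d0}.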
*/ +#define udiv_qr_4by2(q1,q0, r1,r0, n3,n2,n1,n0, d1,d0, di1,di0) \ + do { \ + mp_limb_t _q3, _q2a, _q2, _q1, _q2c, _q1c, _q1d, _q0; \ + mp_limb_t _t1, _t0; \ + mp_limb_t _mask; \ + \ + /* [q3,q2,q1,q0] = [n3,n2]*[di1,di0] + [n3,n2,n1,n0] + [0,1,0,0] */ \ + umul_ppmm (_q2,_q1, n2, di1); \ + umul_ppmm (_q3,_q2a, n3, di1); \ + ++_q2; /* _q2 cannot overflow */ \ + add_ssaaaa (_q3,_q2, _q3,_q2, n3,_q2a); \ + umul_ppmm (_q2c,_q1c, n3, di0); \ + add_sssaaaa (_q3,_q2,_q1, _q2,_q1, n2,_q1c); \ + umul_ppmm (_q1d,_q0, n2, di0); \ + add_sssaaaa (_q2c,_q1,_q0, _q1,_q0, n1,n0); /* _q2c cannot overflow */ \ + add_sssaaaa (_q3,_q2,_q1, _q2,_q1, _q2c,_q1d); \ + \ + umul_ppmm (_t1,_t0, _q2, d0); \ + _t1 += _q2 * d1 + _q3 * d0; \ + \ + sub_ddmmss (r1, r0, n1, n0, _t1, _t0); \ + \ + _mask = -(mp_limb_t) ((r1 >= _q1) & ((r1 > _q1) | (r0 >= _q0))); /* (r1,r0) >= (q1,q0) */ \ + add_ssaaaa (r1, r0, r1, r0, d1 & _mask, d0 & _mask); \ + sub_ddmmss (_q3, _q2, _q3, _q2, CNST_LIMB(0), -_mask); \ + \ + if (UNLIKELY (r1 >= d1)) \ + { \ + if (r1 > d1 || r0 >= d0) \ + { \ + sub_ddmmss (r1, r0, r1, r0, d1, d0); \ + add_ssaaaa (_q3, _q2, _q3, _q2, CNST_LIMB(0), CNST_LIMB(1));\ + } \ + } \ + (q1) = _q3; \ + (q0) = _q2; \ + } while (0) + +static void +invert_4by2 (mp_ptr di, mp_limb_t d1, mp_limb_t d0) +{ + mp_limb_t v1, v0, p1, t1, t0, p0, mask; + invert_limb (v1, d1); + p1 = d1 * v1; + /* <1, v1> * d1 = */ + p1 += d0; + if (p1 < d0) + { + v1--; + mask = -(mp_limb_t) (p1 >= d1); + p1 -= d1; + v1 += mask; + p1 -= mask & d1; + } + /* <1, v1> * d1 + d0 = */ + umul_ppmm (t1, p0, d0, v1); + p1 += t1; + if (p1 < t1) + { + if (UNLIKELY (p1 >= d1)) + { + if (p1 > d1 || p0 >= d0) + { + sub_ddmmss (p1, p0, p1, p0, d1, d0); + v1--; + } + } + sub_ddmmss (p1, p0, p1, p0, d1, d0); + v1--; + } + /* Now v1 is the 3/2 inverse, <1, v1> * = , + * with + >= B^2. + * + * The 4/2 inverse is (B^4 - 1) / = <1, v1, v0>. The + * partial remainder after <1, v1> is + * + * B^4 - 1 - B <1, v1> = - + * = <~p1, ~p0, B-1> + */ + udiv_qr_3by2 (v0, t1, t0, ~p1, ~p0, MP_LIMB_T_MAX, d1, d0, v1); + di[0] = v0; + di[1] = v1; + +#if SANITY_CHECK + { + mp_limb_t tp[4]; + mp_limb_t dp[2]; + dp[0] = d0; + dp[1] = d1; + mpn_mul_n (tp, dp, di, 2); + ASSERT_ALWAYS (mpn_add_n (tp+2, tp+2, dp, 2) == 0); + ASSERT_ALWAYS (tp[2] == MP_LIMB_T_MAX); + ASSERT_ALWAYS (tp[3] == MP_LIMB_T_MAX); + ASSERT_ALWAYS (mpn_add_n (tp, tp, dp, 2) == 1); + } +#endif +} + +static mp_limb_t +mpn_div_qr_2n_pi2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn, + mp_limb_t d1, mp_limb_t d0, mp_limb_t di1, mp_limb_t di0) +{ + mp_limb_t qh; + mp_size_t i; + mp_limb_t r1, r0; + + ASSERT (nn >= 2); + ASSERT (d1 & GMP_NUMB_HIGHBIT); + + r1 = np[nn-1]; + r0 = np[nn-2]; + + qh = 0; + if (r1 >= d1 && (r1 > d1 || r0 >= d0)) + { +#if GMP_NAIL_BITS == 0 + sub_ddmmss (r1, r0, r1, r0, d1, d0); +#else + r0 = r0 - d0; + r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1); + r0 &= GMP_NUMB_MASK; +#endif + qh = 1; + } + + for (i = nn - 2; i >= 2; i -= 2) + { + mp_limb_t n1, n0, q1, q0; + n1 = np[i-1]; + n0 = np[i-2]; + udiv_qr_4by2 (q1, q0, r1, r0, r1, r0, n1, n0, d1, d0, di1, di0); + qp[i-1] = q1; + qp[i-2] = q0; + } + + if (i > 0) + { + mp_limb_t q; + udiv_qr_3by2 (q, r1, r0, r1, r0, np[0], d1, d0, di1); + qp[0] = q; + } + rp[1] = r1; + rp[0] = r0; + + return qh; +} + + +/* Divide num {np,nn} by den {dp,2} and write the nn-2 least + significant quotient limbs at qp and the 2 long remainder at np. + Return the most significant limb of the quotient. + + Preconditions: + 1. 
qp must either not overlap with the other operands at all, or + qp >= np + 2 must hold true. (This means that it's possible to put + the quotient in the high part of {np,nn}, right above the remainder.) + 2. nn >= 2. */ + +mp_limb_t +mpn_div_qr_2 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn, + mp_srcptr dp) +{ + mp_limb_t d1; + mp_limb_t d0; + gmp_pi1_t dinv; + + ASSERT (nn >= 2); + ASSERT (! MPN_OVERLAP_P (qp, nn-2, np, nn) || qp >= np + 2); + ASSERT_MPN (np, nn); + ASSERT_MPN (dp, 2); + + d1 = dp[1]; d0 = dp[0]; + + ASSERT (d1 > 0); + + if (UNLIKELY (d1 & GMP_NUMB_HIGHBIT)) + { + if (BELOW_THRESHOLD (nn, DIV_QR_2_PI2_THRESHOLD)) + { + gmp_pi1_t dinv; + invert_pi1 (dinv, d1, d0); + return mpn_div_qr_2n_pi1 (qp, rp, np, nn, d1, d0, dinv.inv32); + } + else + { + mp_limb_t di[2]; + invert_4by2 (di, d1, d0); + return mpn_div_qr_2n_pi2 (qp, rp, np, nn, d1, d0, di[1], di[0]); + } + } + else + { + int shift; + count_leading_zeros (shift, d1); + d1 = (d1 << shift) | (d0 >> (GMP_LIMB_BITS - shift)); + d0 <<= shift; + invert_pi1 (dinv, d1, d0); + return mpn_div_qr_2u_pi1 (qp, rp, np, nn, d1, d0, shift, dinv.inv32); + } +} diff --git a/gmp-6.3.0/mpn/generic/div_qr_2n_pi1.c b/gmp-6.3.0/mpn/generic/div_qr_2n_pi1.c new file mode 100644 index 0000000..131a811 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_2n_pi1.c @@ -0,0 +1,84 @@ +/* mpn_div_qr_2n_pi1 + + Contributed to the GNU project by Torbjorn Granlund and Niels Möller + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright 1993-1996, 1999-2002, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* 3/2 loop, for normalized divisor */ +mp_limb_t +mpn_div_qr_2n_pi1 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn, + mp_limb_t d1, mp_limb_t d0, mp_limb_t di) +{ + mp_limb_t qh; + mp_size_t i; + mp_limb_t r1, r0; + + ASSERT (nn >= 2); + ASSERT (d1 & GMP_NUMB_HIGHBIT); + + np += nn - 2; + r1 = np[1]; + r0 = np[0]; + + qh = 0; + if (r1 >= d1 && (r1 > d1 || r0 >= d0)) + { +#if GMP_NAIL_BITS == 0 + sub_ddmmss (r1, r0, r1, r0, d1, d0); +#else + r0 = r0 - d0; + r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1); + r0 &= GMP_NUMB_MASK; +#endif + qh = 1; + } + + for (i = nn - 2 - 1; i >= 0; i--) + { + mp_limb_t n0, q; + n0 = np[-1]; + udiv_qr_3by2 (q, r1, r0, r1, r0, n0, d1, d0, di); + np--; + qp[i] = q; + } + + rp[1] = r1; + rp[0] = r0; + + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/div_qr_2u_pi1.c b/gmp-6.3.0/mpn/generic/div_qr_2u_pi1.c new file mode 100644 index 0000000..70e617b --- /dev/null +++ b/gmp-6.3.0/mpn/generic/div_qr_2u_pi1.c @@ -0,0 +1,76 @@ +/* mpn_div_qr_2u_pi1 + + Contributed to the GNU project by Niels Möller + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* 3/2 loop, for unnormalized divisor. Caller must pass shifted d1 and + d0, while {np,nn} is shifted on the fly. */ +mp_limb_t +mpn_div_qr_2u_pi1 (mp_ptr qp, mp_ptr rp, mp_srcptr np, mp_size_t nn, + mp_limb_t d1, mp_limb_t d0, int shift, mp_limb_t di) +{ + mp_limb_t qh; + mp_limb_t r2, r1, r0; + mp_size_t i; + + ASSERT (nn >= 2); + ASSERT (d1 & GMP_NUMB_HIGHBIT); + ASSERT (shift > 0); + + r2 = np[nn-1] >> (GMP_LIMB_BITS - shift); + r1 = (np[nn-1] << shift) | (np[nn-2] >> (GMP_LIMB_BITS - shift)); + r0 = np[nn-2] << shift; + + udiv_qr_3by2 (qh, r2, r1, r2, r1, r0, d1, d0, di); + + for (i = nn - 2 - 1; i >= 0; i--) + { + mp_limb_t q; + r0 = np[i]; + r1 |= r0 >> (GMP_LIMB_BITS - shift); + r0 <<= shift; + udiv_qr_3by2 (q, r2, r1, r2, r1, r0, d1, d0, di); + qp[i] = q; + } + + rp[0] = (r1 >> shift) | (r2 << (GMP_LIMB_BITS - shift)); + rp[1] = r2 >> shift; + + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/dive_1.c b/gmp-6.3.0/mpn/generic/dive_1.c new file mode 100644 index 0000000..056f5b9 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/dive_1.c @@ -0,0 +1,146 @@ +/* mpn_divexact_1 -- mpn by limb exact division. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. 
THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2000-2003, 2005, 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + + +/* Divide a={src,size} by d=divisor and store the quotient in q={dst,size}. + q will only be correct if d divides a exactly. + + A separate loop is used for shift==0 because n<s)" and let the caller do a final umul if interested. + + When the divisor is even, the factors of two could be handled with a + separate mpn_rshift, instead of shifting on the fly. That might be + faster on some CPUs and would mean just the shift==0 style loop would be + needed. + + If n<= 1); + ASSERT (divisor != 0); + ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size)); + ASSERT_MPN (src, size); + ASSERT_LIMB (divisor); + + if ((divisor & 1) == 0) + { + count_trailing_zeros (shift, divisor); + divisor >>= shift; + } + else + shift = 0; + + binvert_limb (inverse, divisor); + divisor <<= GMP_NAIL_BITS; + + if (shift != 0) + { + c = 0; + + s = src[0]; + + for (i = 1; i < size; i++) + { + s_next = src[i]; + ls = ((s >> shift) | (s_next << (GMP_NUMB_BITS-shift))) & GMP_NUMB_MASK; + s = s_next; + + SUBC_LIMB (c, l, ls, c); + + l = (l * inverse) & GMP_NUMB_MASK; + dst[i - 1] = l; + + umul_ppmm (h, dummy, l, divisor); + c += h; + } + + ls = s >> shift; + l = ls - c; + l = (l * inverse) & GMP_NUMB_MASK; + dst[size - 1] = l; + } + else + { + s = src[0]; + + l = (s * inverse) & GMP_NUMB_MASK; + dst[0] = l; + c = 0; + + for (i = 1; i < size; i++) + { + umul_ppmm (h, dummy, l, divisor); + c += h; + + s = src[i]; + SUBC_LIMB (c, l, s, c); + + l = (l * inverse) & GMP_NUMB_MASK; + dst[i] = l; + } + } +} diff --git a/gmp-6.3.0/mpn/generic/diveby3.c b/gmp-6.3.0/mpn/generic/diveby3.c new file mode 100644 index 0000000..7dee0bc --- /dev/null +++ b/gmp-6.3.0/mpn/generic/diveby3.c @@ -0,0 +1,173 @@ +/* mpn_divexact_by3c -- mpn exact division by 3. + +Copyright 2000-2003, 2008 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if DIVEXACT_BY3_METHOD == 0 + +mp_limb_t +mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c) +{ + mp_limb_t r; + r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c); + + /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3. + We want to return C. We compute the remainder mod 4 and notice that the + inverse of (2^(2k)-1)/3 mod 4 is 1. */ + return r & 3; +} + +#endif + +#if DIVEXACT_BY3_METHOD == 1 + +/* The algorithm here is basically the same as mpn_divexact_1, as described + in the manual. Namely at each step q = (src[i]-c)*inverse, and new c = + borrow(src[i]-c) + high(divisor*q). But because the divisor is just 3, + high(divisor*q) can be determined with two comparisons instead of a + multiply. + + The "c += ..."s add the high limb of 3*l to c. That high limb will be 0, + 1 or 2. Doing two separate "+="s seems to give better code on gcc (as of + 2.95.2 at least). + + It will be noted that the new c is formed by adding three values each 0 + or 1. But the total is only 0, 1 or 2. When the subtraction src[i]-c + causes a borrow, that leaves a limb value of either 0xFF...FF or + 0xFF...FE. The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or + 0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1 + respectively, hence a total of no more than 2. + + Alternatives: + + This implementation has each multiply on the dependent chain, due to + "l=s-c". See below for alternative code which avoids that. */ + +mp_limb_t +mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c) +{ + mp_limb_t l, q, s; + mp_size_t i; + + ASSERT (un >= 1); + ASSERT (c == 0 || c == 1 || c == 2); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un)); + + i = 0; + do + { + s = up[i]; + SUBC_LIMB (c, l, s, c); + + q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; + rp[i] = q; + + c += (q >= GMP_NUMB_CEIL_MAX_DIV3); + c += (q >= GMP_NUMB_CEIL_2MAX_DIV3); + } + while (++i < un); + + ASSERT (c == 0 || c == 1 || c == 2); + return c; +} + + +#endif + +#if DIVEXACT_BY3_METHOD == 2 + +/* The following alternative code re-arranges the quotient calculation from + (src[i]-c)*inverse to instead + + q = src[i]*inverse - c*inverse + + thereby allowing src[i]*inverse to be scheduled back as far as desired, + making full use of multiplier throughput and leaving just some carry + handing on the dependent chain. + + The carry handling consists of determining the c for the next iteration. + This is the same as described above, namely look for any borrow from + src[i]-c, and at the high of 3*q. + + high(3*q) is done with two comparisons as above (in c2 and c3). The + borrow from src[i]-c is incorporated into those by noting that if there's + a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn + giving q = 0x55..55 or 0xAA..AA. Adding 1 to either of those q values is + enough to make high(3*q) come out 1 bigger, as required. 
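+
+   That adjustment is what the "qx = q + (s < cy)" line below implements:
+   s < cy detects the borrow from src[i]-c, and the two threshold
+   comparisons c2 and c3 are then made against qx instead of q.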
+ + l = -c*inverse is calculated at the same time as c, since for most chips + it can be more conveniently derived from separate c1/c2/c3 values than + from a combined c equal to 0, 1 or 2. + + The net effect is that with good pipelining this loop should be able to + run at perhaps 4 cycles/limb, depending on available execute resources + etc. + + Usage: + + This code is not used by default, since we really can't rely on the + compiler generating a good software pipeline, nor on such an approach + even being worthwhile on all CPUs. + + Itanium is one chip where this algorithm helps though, see + mpn/ia64/diveby3.asm. */ + +mp_limb_t +mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy) +{ + mp_limb_t s, sm, cl, q, qx, c2, c3; + mp_size_t i; + + ASSERT (un >= 1); + ASSERT (cy == 0 || cy == 1 || cy == 2); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un)); + + cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3; + + for (i = 0; i < un; i++) + { + s = up[i]; + sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; + + q = (cl + sm) & GMP_NUMB_MASK; + rp[i] = q; + qx = q + (s < cy); + + c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3; + c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ; + + cy = c2 + c3; + cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3); + } + + return cy; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/divexact.c b/gmp-6.3.0/mpn/generic/divexact.c new file mode 100644 index 0000000..ec417df --- /dev/null +++ b/gmp-6.3.0/mpn/generic/divexact.c @@ -0,0 +1,296 @@ +/* mpn_divexact(qp,np,nn,dp,dn,tp) -- Divide N = {np,nn} by D = {dp,dn} storing + the result in Q = {qp,nn-dn+1} expecting no remainder. Overlap allowed + between Q and N; all other overlap disallowed. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2007, 2009, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" +#include "longlong.h" + +#if 1 +void +mpn_divexact (mp_ptr qp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn) +{ + unsigned shift; + mp_size_t qn; + mp_ptr tp; + TMP_DECL; + + ASSERT (dn > 0); + ASSERT (nn >= dn); + ASSERT (dp[dn-1] > 0); + + while (dp[0] == 0) + { + ASSERT (np[0] == 0); + dp++; + np++; + dn--; + nn--; + } + + if (dn == 1) + { + MPN_DIVREM_OR_DIVEXACT_1 (qp, np, nn, dp[0]); + return; + } + + TMP_MARK; + + qn = nn + 1 - dn; + count_trailing_zeros (shift, dp[0]); + + if (shift > 0) + { + mp_ptr wp; + mp_size_t ss; + ss = (dn > qn) ? qn + 1 : dn; + + tp = TMP_ALLOC_LIMBS (ss); + mpn_rshift (tp, dp, ss, shift); + dp = tp; + + /* Since we have excluded dn == 1, we have nn > qn, and we need + to shift one limb beyond qn. */ + wp = TMP_ALLOC_LIMBS (qn + 1); + mpn_rshift (wp, np, qn + 1, shift); + np = wp; + } + + if (dn > qn) + dn = qn; + + tp = TMP_ALLOC_LIMBS (mpn_bdiv_q_itch (qn, dn)); + mpn_bdiv_q (qp, np, qn, dp, dn, tp); + TMP_FREE; + + /* Since bdiv_q computes -N/D (mod B^{qn}), we must negate now. */ + mpn_neg (qp, qp, qn); +} + +#else + +/* We use the Jebelean's bidirectional exact division algorithm. This is + somewhat naively implemented, with equal quotient parts done by 2-adic + division and truncating division. Since 2-adic division is faster, it + should be used for a larger chunk. + + This code is horrendously ugly, in all sorts of ways. + + * It was hacked without much care or thought, but with a testing program. + * It handles scratch space frivolously, and furthermore the itch function + is broken. + * Doesn't provide any measures to deal with mu_divappr_q's +3 error. We + have yet to provoke an error due to this, though. + * Algorithm selection leaves a lot to be desired. In particular, the choice + between DC and MU isn't a point, but we treat it like one. + * It makes the msb part 1 or 2 limbs larger than the lsb part, in spite of + that the latter is faster. We should at least reverse this, but perhaps + we should make the lsb part considerably larger. (How do we tune this?) +*/ + +mp_size_t +mpn_divexact_itch (mp_size_t nn, mp_size_t dn) +{ + return nn + dn; /* FIXME this is not right */ +} + +void +mpn_divexact (mp_ptr qp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr scratch) +{ + mp_size_t qn; + mp_size_t nn0, qn0; + mp_size_t nn1, qn1; + mp_ptr tp; + mp_limb_t qml; + mp_limb_t qh; + int cnt; + mp_ptr xdp; + mp_limb_t di; + mp_limb_t cy; + gmp_pi1_t dinv; + TMP_DECL; + + TMP_MARK; + + qn = nn - dn + 1; + + /* For small divisors, and small quotients, don't use Jebelean's algorithm. */ + if (dn < DIVEXACT_JEB_THRESHOLD || qn < DIVEXACT_JEB_THRESHOLD) + { + tp = scratch; + MPN_COPY (tp, np, qn); + binvert_limb (di, dp[0]); di = -di; + dn = MIN (dn, qn); + mpn_sbpi1_bdiv_q (qp, tp, qn, dp, dn, di); + TMP_FREE; + return; + } + + qn0 = ((nn - dn) >> 1) + 1; /* low quotient size */ + + /* If quotient is much larger than the divisor, the bidirectional algorithm + does not work as currently implemented. Fall back to plain bdiv. 
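+   For the remaining (normal) case below, a rough sketch of the flow: the
+   low qn0 quotient limbs come from a 2-adic (bdiv) division of the low
+   limbs of N, the high limbs from a truncating approximate division of
+   the high limbs; the two halves share one limb (qml), and if the
+   approximate value of that limb falls short of the exact 2-adic one, the
+   high part of the quotient is decremented by one.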
*/ + if (qn0 > dn) + { + if (BELOW_THRESHOLD (dn, DC_BDIV_Q_THRESHOLD)) + { + tp = scratch; + MPN_COPY (tp, np, qn); + binvert_limb (di, dp[0]); di = -di; + dn = MIN (dn, qn); + mpn_sbpi1_bdiv_q (qp, tp, qn, dp, dn, di); + } + else if (BELOW_THRESHOLD (dn, MU_BDIV_Q_THRESHOLD)) + { + tp = scratch; + MPN_COPY (tp, np, qn); + binvert_limb (di, dp[0]); di = -di; + mpn_dcpi1_bdiv_q (qp, tp, qn, dp, dn, di); + } + else + { + mpn_mu_bdiv_q (qp, np, qn, dp, dn, scratch); + } + TMP_FREE; + return; + } + + nn0 = qn0 + qn0; + + nn1 = nn0 - 1 + ((nn-dn) & 1); + qn1 = qn0; + if (LIKELY (qn0 != dn)) + { + nn1 = nn1 + 1; + qn1 = qn1 + 1; + if (UNLIKELY (dp[dn - 1] == 1 && qn1 != dn)) + { + /* If the leading divisor limb == 1, i.e. has just one bit, we have + to include an extra limb in order to get the needed overlap. */ + /* FIXME: Now with the mu_divappr_q function, we should really need + more overlap. That indicates one of two things: (1) The test code + is not good. (2) We actually overlap too much by default. */ + nn1 = nn1 + 1; + qn1 = qn1 + 1; + } + } + + tp = TMP_ALLOC_LIMBS (nn1 + 1); + + count_leading_zeros (cnt, dp[dn - 1]); + + /* Normalize divisor, store into tmp area. */ + if (cnt != 0) + { + xdp = TMP_ALLOC_LIMBS (qn1); + mpn_lshift (xdp, dp + dn - qn1, qn1, cnt); + } + else + { + xdp = (mp_ptr) dp + dn - qn1; + } + + /* Shift dividend according to the divisor normalization. */ + /* FIXME: We compute too much here for XX_divappr_q, but these functions' + interfaces want a pointer to the imaginative least significant limb, not + to the least significant *used* limb. Of course, we could leave nn1-qn1 + rubbish limbs in the low part, to save some time. */ + if (cnt != 0) + { + cy = mpn_lshift (tp, np + nn - nn1, nn1, cnt); + if (cy != 0) + { + tp[nn1] = cy; + nn1++; + } + } + else + { + /* FIXME: This copy is not needed for mpn_mu_divappr_q, except when the + mpn_sub_n right before is executed. */ + MPN_COPY (tp, np + nn - nn1, nn1); + } + + invert_pi1 (dinv, xdp[qn1 - 1], xdp[qn1 - 2]); + if (BELOW_THRESHOLD (qn1, DC_DIVAPPR_Q_THRESHOLD)) + { + qp[qn0 - 1 + nn1 - qn1] = mpn_sbpi1_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, dinv.inv32); + } + else if (BELOW_THRESHOLD (qn1, MU_DIVAPPR_Q_THRESHOLD)) + { + qp[qn0 - 1 + nn1 - qn1] = mpn_dcpi1_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, &dinv); + } + else + { + /* FIXME: mpn_mu_divappr_q doesn't handle qh != 0. Work around it with a + conditional subtraction here. */ + qh = mpn_cmp (tp + nn1 - qn1, xdp, qn1) >= 0; + if (qh) + mpn_sub_n (tp + nn1 - qn1, tp + nn1 - qn1, xdp, qn1); + mpn_mu_divappr_q (qp + qn0 - 1, tp, nn1, xdp, qn1, scratch); + qp[qn0 - 1 + nn1 - qn1] = qh; + } + qml = qp[qn0 - 1]; + + binvert_limb (di, dp[0]); di = -di; + + if (BELOW_THRESHOLD (qn0, DC_BDIV_Q_THRESHOLD)) + { + MPN_COPY (tp, np, qn0); + mpn_sbpi1_bdiv_q (qp, tp, qn0, dp, qn0, di); + } + else if (BELOW_THRESHOLD (qn0, MU_BDIV_Q_THRESHOLD)) + { + MPN_COPY (tp, np, qn0); + mpn_dcpi1_bdiv_q (qp, tp, qn0, dp, qn0, di); + } + else + { + mpn_mu_bdiv_q (qp, np, qn0, dp, qn0, scratch); + } + + if (qml < qp[qn0 - 1]) + mpn_decr_u (qp + qn0, 1); + + TMP_FREE; +} +#endif diff --git a/gmp-6.3.0/mpn/generic/divis.c b/gmp-6.3.0/mpn/generic/divis.c new file mode 100644 index 0000000..f989ddb --- /dev/null +++ b/gmp-6.3.0/mpn/generic/divis.c @@ -0,0 +1,194 @@ +/* mpn_divisible_p -- mpn by mpn divisibility test + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. 
+ +Copyright 2001, 2002, 2005, 2009, 2014, 2017, 2018 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Determine whether A={ap,an} is divisible by D={dp,dn}. Must have both + operands normalized, meaning high limbs non-zero, except that an==0 is + allowed. + + There usually won't be many low zero bits on D, but the checks for this + are fast and might pick up a few operand combinations, in particular they + might reduce D to fit the single-limb mod_1/modexact_1 code. + + Future: + + Getting the remainder limb by limb would make an early exit possible on + finding a non-zero. This would probably have to be bdivmod style so + there's no addback, but it would need a multi-precision inverse and so + might be slower than the plain method (on small sizes at least). + + When D must be normalized (shifted to low bit set), it's possible to + suppress the bit-shifting of A down, as long as it's already been checked + that A has at least as many trailing zero bits as D. */ + +int +mpn_divisible_p (mp_srcptr ap, mp_size_t an, + mp_srcptr dp, mp_size_t dn) +{ + mp_limb_t alow, dlow, dmask; + mp_ptr qp, rp, tp; + mp_limb_t di; + unsigned twos; + int c; + TMP_DECL; + + ASSERT (an >= 0); + ASSERT (an == 0 || ap[an-1] != 0); + ASSERT (dn >= 1); + ASSERT (dp[dn-1] != 0); + ASSERT_MPN (ap, an); + ASSERT_MPN (dp, dn); + + /* When a= 1); + dn--; ASSERT (dn >= 1); + ap++; + dp++; + } + + /* a must have at least as many low zero bits as d */ + dmask = LOW_ZEROS_MASK (dlow); + if ((alow & dmask) != 0) + return 0; + + if (dn == 1) + { + if (ABOVE_THRESHOLD (an, BMOD_1_TO_MOD_1_THRESHOLD)) + return mpn_mod_1 (ap, an, dlow) == 0; + + count_trailing_zeros (twos, dlow); + dlow >>= twos; + return mpn_modexact_1_odd (ap, an, dlow) == 0; + } + + count_trailing_zeros (twos, dlow); + if (dn == 2) + { + mp_limb_t dsecond = dp[1]; + if (dsecond <= dmask) + { + dlow = (dlow >> twos) | (dsecond << (GMP_NUMB_BITS-twos)); + ASSERT_LIMB (dlow); + return MPN_MOD_OR_MODEXACT_1_ODD (ap, an, dlow) == 0; + } + } + + /* Should we compute Q = A * D^(-1) mod B^k, + R = A - Q * D mod B^k + here, for some small values of k? Then check if R = 0 (mod B^k). */ + + /* We could also compute A' = A mod T and D' = D mod P, for some + P = 3 * 5 * 7 * 11 ..., and then check if any prime factor from P + dividing D' also divides A'. */ + + TMP_MARK; + + TMP_ALLOC_LIMBS_2 (rp, an + 1, + qp, an - dn + 1); /* FIXME: Could we avoid this? 
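+                         The quotient {qp, an-dn+1} is never examined
+                         below, only the remainder is, so a bdiv variant
+                         that discarded Q could avoid the allocation.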
*/ + + if (twos != 0) + { + tp = TMP_ALLOC_LIMBS (dn); + ASSERT_NOCARRY (mpn_rshift (tp, dp, dn, twos)); + dp = tp; + + ASSERT_NOCARRY (mpn_rshift (rp, ap, an, twos)); + } + else + { + MPN_COPY (rp, ap, an); + } + if (rp[an - 1] >= dp[dn - 1]) + { + rp[an] = 0; + an++; + } + else if (an == dn) + { + TMP_FREE; + return 0; + } + + ASSERT (an > dn); /* requirement of functions below */ + + if (BELOW_THRESHOLD (dn, DC_BDIV_QR_THRESHOLD) || + BELOW_THRESHOLD (an - dn, DC_BDIV_QR_THRESHOLD)) + { + binvert_limb (di, dp[0]); + mpn_sbpi1_bdiv_qr (qp, rp, an, dp, dn, -di); + rp += an - dn; + } + else if (BELOW_THRESHOLD (dn, MU_BDIV_QR_THRESHOLD)) + { + binvert_limb (di, dp[0]); + mpn_dcpi1_bdiv_qr (qp, rp, an, dp, dn, -di); + rp += an - dn; + } + else + { + tp = TMP_ALLOC_LIMBS (mpn_mu_bdiv_qr_itch (an, dn)); + mpn_mu_bdiv_qr (qp, rp, rp, an, dp, dn, tp); + } + + /* In general, bdiv may return either R = 0 or R = D when D divides + A. But R = 0 can happen only when A = 0, which we already have + excluded. Furthermore, R == D (mod B^{dn}) implies no carry, so + we don't need to check the carry returned from bdiv. */ + + MPN_CMP (c, rp, dp, dn); + + TMP_FREE; + return c == 0; +} diff --git a/gmp-6.3.0/mpn/generic/divrem.c b/gmp-6.3.0/mpn/generic/divrem.c new file mode 100644 index 0000000..1da84a8 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/divrem.c @@ -0,0 +1,103 @@ +/* mpn_divrem -- Divide natural numbers, producing both remainder and + quotient. This is now just a middle layer calling mpn_tdiv_qr. + +Copyright 1993-1997, 1999-2002, 2005, 2016 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_divrem (mp_ptr qp, mp_size_t qxn, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn) +{ + ASSERT (qxn >= 0); + ASSERT (nn >= dn); + ASSERT (dn >= 1); + ASSERT (dp[dn-1] & GMP_NUMB_HIGHBIT); + ASSERT (! MPN_OVERLAP_P (np, nn, dp, dn)); + ASSERT (! MPN_OVERLAP_P (qp, nn-dn+qxn, np, nn) || qp==np+dn+qxn); + ASSERT (! 
MPN_OVERLAP_P (qp, nn-dn+qxn, dp, dn)); + ASSERT_MPN (np, nn); + ASSERT_MPN (dp, dn); + + if (dn == 1) + { + mp_limb_t ret; + mp_ptr q2p; + mp_size_t qn; + TMP_DECL; + + TMP_MARK; + q2p = TMP_ALLOC_LIMBS (nn + qxn); + + np[0] = mpn_divrem_1 (q2p, qxn, np, nn, dp[0]); + qn = nn + qxn - 1; + MPN_COPY (qp, q2p, qn); + ret = q2p[qn]; + + TMP_FREE; + return ret; + } + else if (dn == 2) + { + return mpn_divrem_2 (qp, qxn, np, nn, dp); + } + else + { + mp_ptr q2p; + mp_limb_t qhl; + mp_size_t qn; + TMP_DECL; + + TMP_MARK; + if (UNLIKELY (qxn != 0)) + { + mp_ptr n2p; + TMP_ALLOC_LIMBS_2 (n2p, nn + qxn, + q2p, nn - dn + qxn + 1); + MPN_ZERO (n2p, qxn); + MPN_COPY (n2p + qxn, np, nn); + mpn_tdiv_qr (q2p, np, 0L, n2p, nn + qxn, dp, dn); + qn = nn - dn + qxn; + MPN_COPY (qp, q2p, qn); + qhl = q2p[qn]; + } + else + { + q2p = TMP_ALLOC_LIMBS (nn - dn + 1); + mpn_tdiv_qr (q2p, np, 0L, np, nn, dp, dn); + qn = nn - dn; + MPN_COPY (qp, q2p, qn); + qhl = q2p[qn]; + } + TMP_FREE; + return qhl; + } +} diff --git a/gmp-6.3.0/mpn/generic/divrem_1.c b/gmp-6.3.0/mpn/generic/divrem_1.c new file mode 100644 index 0000000..c13aa79 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/divrem_1.c @@ -0,0 +1,254 @@ +/* mpn_divrem_1 -- mpn by limb division. + +Copyright 1991, 1993, 1994, 1996, 1998-2000, 2002, 2003 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* The size where udiv_qrnnd_preinv should be used rather than udiv_qrnnd, + meaning the quotient size where that should happen, the quotient size + being how many udiv divisions will be done. + + The default is to use preinv always, CPUs where this doesn't suit have + tuned thresholds. Note in particular that preinv should certainly be + used if that's the only division available (USE_PREINV_ALWAYS). */ + +#ifndef DIVREM_1_NORM_THRESHOLD +#define DIVREM_1_NORM_THRESHOLD 0 +#endif +#ifndef DIVREM_1_UNNORM_THRESHOLD +#define DIVREM_1_UNNORM_THRESHOLD 0 +#endif + + + +/* If the cpu only has multiply-by-inverse division (eg. alpha), then NORM + and UNNORM thresholds are 0 and only the inversion code is included. + + If multiply-by-inverse is never viable, then NORM and UNNORM thresholds + will be MP_SIZE_T_MAX and only the plain division code is included. + + Otherwise mul-by-inverse is better than plain division above some + threshold, and best results are obtained by having code for both present. + + The main reason for separating the norm and unnorm cases is that not all + CPUs give zero for "n0 >> GMP_LIMB_BITS" which would arise in the unnorm + code used on an already normalized divisor. 
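+
+   For reference, as an informal summary only: for a normalized divisor d,
+   invert_limb computes dinv = floor((B^2 - 1) / d) - B with
+   B = 2^GMP_LIMB_BITS, and udiv_qrnnd_preinv then derives each quotient
+   limb from one umul_ppmm by dinv plus a few additions and conditional
+   adjustments, rather than from a hardware divide.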
+ + If UDIV_NEEDS_NORMALIZATION is false then plain division uses the same + non-shifting code for both the norm and unnorm cases, though with + different criteria for skipping a division, and with different thresholds + of course. And in fact if inversion is never viable, then that simple + non-shifting division would be all that's left. + + The NORM and UNNORM thresholds might not differ much, but if there's + going to be separate code for norm and unnorm then it makes sense to have + separate thresholds. One thing that's possible is that the + mul-by-inverse might be better only for normalized divisors, due to that + case not needing variable bit shifts. + + Notice that the thresholds are tested after the decision to possibly skip + one divide step, so they're based on the actual number of divisions done. + + For the unnorm case, it would be possible to call mpn_lshift to adjust + the dividend all in one go (into the quotient space say), rather than + limb-by-limb in the loop. This might help if mpn_lshift is a lot faster + than what the compiler can generate for EXTRACT. But this is left to CPU + specific implementations to consider, especially since EXTRACT isn't on + the dependent chain. */ + +mp_limb_t +mpn_divrem_1 (mp_ptr qp, mp_size_t qxn, + mp_srcptr up, mp_size_t un, mp_limb_t d) +{ + mp_size_t n; + mp_size_t i; + mp_limb_t n1, n0; + mp_limb_t r = 0; + + ASSERT (qxn >= 0); + ASSERT (un >= 0); + ASSERT (d != 0); + /* FIXME: What's the correct overlap rule when qxn!=0? */ + ASSERT (MPN_SAME_OR_SEPARATE_P (qp+qxn, up, un)); + + n = un + qxn; + if (n == 0) + return 0; + + d <<= GMP_NAIL_BITS; + + qp += (n - 1); /* Make qp point at most significant quotient limb */ + + if ((d & GMP_LIMB_HIGHBIT) != 0) + { + if (un != 0) + { + /* High quotient limb is 0 or 1, skip a divide step. */ + mp_limb_t q; + r = up[un - 1] << GMP_NAIL_BITS; + q = (r >= d); + *qp-- = q; + r -= (d & -q); + r >>= GMP_NAIL_BITS; + n--; + un--; + } + + if (BELOW_THRESHOLD (n, DIVREM_1_NORM_THRESHOLD)) + { + plain: + for (i = un - 1; i >= 0; i--) + { + n0 = up[i] << GMP_NAIL_BITS; + udiv_qrnnd (*qp, r, r, n0, d); + r >>= GMP_NAIL_BITS; + qp--; + } + for (i = qxn - 1; i >= 0; i--) + { + udiv_qrnnd (*qp, r, r, CNST_LIMB(0), d); + r >>= GMP_NAIL_BITS; + qp--; + } + return r; + } + else + { + /* Multiply-by-inverse, divisor already normalized. */ + mp_limb_t dinv; + invert_limb (dinv, d); + + for (i = un - 1; i >= 0; i--) + { + n0 = up[i] << GMP_NAIL_BITS; + udiv_qrnnd_preinv (*qp, r, r, n0, d, dinv); + r >>= GMP_NAIL_BITS; + qp--; + } + for (i = qxn - 1; i >= 0; i--) + { + udiv_qrnnd_preinv (*qp, r, r, CNST_LIMB(0), d, dinv); + r >>= GMP_NAIL_BITS; + qp--; + } + return r; + } + } + else + { + /* Most significant bit of divisor == 0. */ + int cnt; + + /* Skip a division if high < divisor (high quotient 0). Testing here + before normalizing will still skip as often as possible. */ + if (un != 0) + { + n1 = up[un - 1] << GMP_NAIL_BITS; + if (n1 < d) + { + r = n1 >> GMP_NAIL_BITS; + *qp-- = 0; + n--; + if (n == 0) + return r; + un--; + } + } + + if (! 
UDIV_NEEDS_NORMALIZATION + && BELOW_THRESHOLD (n, DIVREM_1_UNNORM_THRESHOLD)) + goto plain; + + count_leading_zeros (cnt, d); + d <<= cnt; + r <<= cnt; + + if (UDIV_NEEDS_NORMALIZATION + && BELOW_THRESHOLD (n, DIVREM_1_UNNORM_THRESHOLD)) + { + mp_limb_t nshift; + if (un != 0) + { + n1 = up[un - 1] << GMP_NAIL_BITS; + r |= (n1 >> (GMP_LIMB_BITS - cnt)); + for (i = un - 2; i >= 0; i--) + { + n0 = up[i] << GMP_NAIL_BITS; + nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt)); + udiv_qrnnd (*qp, r, r, nshift, d); + r >>= GMP_NAIL_BITS; + qp--; + n1 = n0; + } + udiv_qrnnd (*qp, r, r, n1 << cnt, d); + r >>= GMP_NAIL_BITS; + qp--; + } + for (i = qxn - 1; i >= 0; i--) + { + udiv_qrnnd (*qp, r, r, CNST_LIMB(0), d); + r >>= GMP_NAIL_BITS; + qp--; + } + return r >> cnt; + } + else + { + mp_limb_t dinv, nshift; + invert_limb (dinv, d); + if (un != 0) + { + n1 = up[un - 1] << GMP_NAIL_BITS; + r |= (n1 >> (GMP_LIMB_BITS - cnt)); + for (i = un - 2; i >= 0; i--) + { + n0 = up[i] << GMP_NAIL_BITS; + nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt)); + udiv_qrnnd_preinv (*qp, r, r, nshift, d, dinv); + r >>= GMP_NAIL_BITS; + qp--; + n1 = n0; + } + udiv_qrnnd_preinv (*qp, r, r, n1 << cnt, d, dinv); + r >>= GMP_NAIL_BITS; + qp--; + } + for (i = qxn - 1; i >= 0; i--) + { + udiv_qrnnd_preinv (*qp, r, r, CNST_LIMB(0), d, dinv); + r >>= GMP_NAIL_BITS; + qp--; + } + return r >> cnt; + } + } +} diff --git a/gmp-6.3.0/mpn/generic/divrem_2.c b/gmp-6.3.0/mpn/generic/divrem_2.c new file mode 100644 index 0000000..217f2f6 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/divrem_2.c @@ -0,0 +1,118 @@ +/* mpn_divrem_2 -- Divide natural numbers, producing both remainder and + quotient. The divisor is two limbs. + + THIS FILE CONTAINS INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright 1993-1996, 1999-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Divide num {np,nn} by den {dp,2} and write the nn-2 least significant + quotient limbs at qp and the 2 long remainder at np. If qxn is non-zero, + generate that many fraction bits and append them after the other quotient + limbs. Return the most significant limb of the quotient, this is always 0 + or 1. + + Preconditions: + 1. The most significant bit of the divisor must be set. + 2. qp must either not overlap with the input operands at all, or + qp >= np + 2 must hold true. 
(This means that it's possible to put + the quotient in the high part of {np,nn}, right above the remainder. + 3. nn >= 2, even if qxn is non-zero. */ + +mp_limb_t +mpn_divrem_2 (mp_ptr qp, mp_size_t qxn, + mp_ptr np, mp_size_t nn, + mp_srcptr dp) +{ + mp_limb_t most_significant_q_limb; + mp_size_t i; + mp_limb_t r1, r0, d1, d0; + gmp_pi1_t di; + + ASSERT (nn >= 2); + ASSERT (qxn >= 0); + ASSERT (dp[1] & GMP_NUMB_HIGHBIT); + ASSERT (! MPN_OVERLAP_P (qp, nn-2+qxn, np, nn) || qp >= np+2); + ASSERT_MPN (np, nn); + ASSERT_MPN (dp, 2); + + np += nn - 2; + d1 = dp[1]; + d0 = dp[0]; + r1 = np[1]; + r0 = np[0]; + + most_significant_q_limb = 0; + if (r1 >= d1 && (r1 > d1 || r0 >= d0)) + { +#if GMP_NAIL_BITS == 0 + sub_ddmmss (r1, r0, r1, r0, d1, d0); +#else + r0 = r0 - d0; + r1 = r1 - d1 - (r0 >> GMP_LIMB_BITS - 1); + r0 &= GMP_NUMB_MASK; +#endif + most_significant_q_limb = 1; + } + + invert_pi1 (di, d1, d0); + + qp += qxn; + + for (i = nn - 2 - 1; i >= 0; i--) + { + mp_limb_t n0, q; + n0 = np[-1]; + udiv_qr_3by2 (q, r1, r0, r1, r0, n0, d1, d0, di.inv32); + np--; + qp[i] = q; + } + + if (UNLIKELY (qxn != 0)) + { + qp -= qxn; + for (i = qxn - 1; i >= 0; i--) + { + mp_limb_t q; + udiv_qr_3by2 (q, r1, r0, r1, r0, CNST_LIMB(0), d1, d0, di.inv32); + qp[i] = q; + } + } + + np[1] = r1; + np[0] = r0; + + return most_significant_q_limb; +} diff --git a/gmp-6.3.0/mpn/generic/dump.c b/gmp-6.3.0/mpn/generic/dump.c new file mode 100644 index 0000000..9a4ddf4 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/dump.c @@ -0,0 +1,99 @@ +/* THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS NOT SAFE TO + CALL THIS FUNCTION DIRECTLY. IN FACT, IT IS ALMOST GUARANTEED THAT THIS + FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + + +Copyright 1996, 2000-2002, 2005 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include +#include "gmp-impl.h" + +#if GMP_NUMB_BITS % 4 == 0 +void +mpn_dump (mp_srcptr ptr, mp_size_t n) +{ + MPN_NORMALIZE (ptr, n); + + if (n == 0) + printf ("0\n"); + else + { + n--; +#if _LONG_LONG_LIMB + if ((ptr[n] >> GMP_LIMB_BITS / 2) != 0) + { + printf ("%lX", (unsigned long) (ptr[n] >> GMP_LIMB_BITS / 2)); + printf ("%0*lX", (GMP_LIMB_BITS / 2 / 4), (unsigned long) ptr[n]); + } + else +#endif + printf ("%lX", (unsigned long) ptr[n]); + + while (n) + { + n--; +#if _LONG_LONG_LIMB + printf ("%0*lX", (GMP_NUMB_BITS - GMP_LIMB_BITS / 2) / 4, + (unsigned long) (ptr[n] >> GMP_LIMB_BITS / 2)); + printf ("%0*lX", GMP_LIMB_BITS / 2 / 4, (unsigned long) ptr[n]); +#else + printf ("%0*lX", GMP_NUMB_BITS / 4, (unsigned long) ptr[n]); +#endif + } + printf ("\n"); + } +} + +#else + +static void +mpn_recdump (mp_ptr p, mp_size_t n) +{ + mp_limb_t lo; + if (n != 0) + { + lo = p[0] & 0xf; + mpn_rshift (p, p, n, 4); + mpn_recdump (p, n); + printf ("%lX", lo); + } +} + +void +mpn_dump (mp_srcptr p, mp_size_t n) +{ + mp_ptr tp; + TMP_DECL; + TMP_MARK; + tp = TMP_ALLOC_LIMBS (n); + MPN_COPY (tp, p, n); + TMP_FREE; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/fib2_ui.c b/gmp-6.3.0/mpn/generic/fib2_ui.c new file mode 100644 index 0000000..0b81571 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/fib2_ui.c @@ -0,0 +1,174 @@ +/* mpn_fib2_ui -- calculate Fibonacci numbers. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2001, 2002, 2005, 2009, 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include +#include "gmp-impl.h" + +/* change this to "#define TRACE(x) x" for diagnostics */ +#define TRACE(x) + + +/* Store F[n] at fp and F[n-1] at f1p. fp and f1p should have room for + MPN_FIB2_SIZE(n) limbs. + + The return value is the actual number of limbs stored, this will be at + least 1. fp[size-1] will be non-zero, except when n==0, in which case + fp[0] is 0 and f1p[0] is 1. f1p[size-1] can be zero, since F[n-1]0). + + Notes: F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k. + + In F[2k+1] with k even, +2 is applied to 4*F[k]^2 just by ORing into the + low limb. + + In F[2k+1] with k odd, -2 is applied to F[k-1]^2 just by ORing into the + low limb. +*/ + +mp_size_t +mpn_fib2_ui (mp_ptr fp, mp_ptr f1p, unsigned long int n) +{ + mp_size_t size; + unsigned long nfirst, mask; + + TRACE (printf ("mpn_fib2_ui n=%lu\n", n)); + + ASSERT (! MPN_OVERLAP_P (fp, MPN_FIB2_SIZE(n), f1p, MPN_FIB2_SIZE(n))); + + /* Take a starting pair from the table. 
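+     As an informal check of the identities used in the doubling loop
+     below: with k = 5, F[5] = 5 and F[4] = 3 give
+     F[9] = 5^2 + 3^2 = 34, F[11] = 4*5^2 - 3^2 + 2*(-1)^5 = 89, and then
+     F[10] = 89 - 34 = 55.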
*/ + mask = 1; + for (nfirst = n; nfirst > FIB_TABLE_LIMIT; nfirst /= 2) + mask <<= 1; + TRACE (printf ("nfirst=%lu mask=0x%lX\n", nfirst, mask)); + + f1p[0] = FIB_TABLE ((int) nfirst - 1); + fp[0] = FIB_TABLE (nfirst); + size = 1; + + /* Skip to the end if the table lookup gives the final answer. */ + if (mask != 1) + { + mp_size_t alloc; + mp_ptr xp; + TMP_DECL; + + TMP_MARK; + alloc = MPN_FIB2_SIZE (n); + xp = TMP_ALLOC_LIMBS (alloc); + + do + { + /* Here fp==F[k] and f1p==F[k-1], with k being the bits of n from + n&mask upwards. + + The next bit of n is n&(mask>>1) and we'll double to the pair + fp==F[2k],f1p==F[2k-1] or fp==F[2k+1],f1p==F[2k], according as + that bit is 0 or 1 respectively. */ + + TRACE (printf ("k=%lu mask=0x%lX size=%ld alloc=%ld\n", + n >> refmpn_count_trailing_zeros(mask), + mask, size, alloc); + mpn_trace ("fp ", fp, size); + mpn_trace ("f1p", f1p, size)); + + /* fp normalized, f1p at most one high zero */ + ASSERT (fp[size-1] != 0); + ASSERT (f1p[size-1] != 0 || f1p[size-2] != 0); + + /* f1p[size-1] might be zero, but this occurs rarely, so it's not + worth bothering checking for it */ + ASSERT (alloc >= 2*size); + mpn_sqr (xp, fp, size); + mpn_sqr (fp, f1p, size); + size *= 2; + + /* Shrink if possible. Since fp was normalized there'll be at + most one high zero on xp (and if there is then there's one on + yp too). */ + ASSERT (xp[size-1] != 0 || fp[size-1] == 0); + size -= (xp[size-1] == 0); + ASSERT (xp[size-1] != 0); /* only one xp high zero */ + + /* Calculate F[2k-1] = F[k]^2 + F[k-1]^2. */ + f1p[size] = mpn_add_n (f1p, xp, fp, size); + + /* Calculate F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k. + n&mask is the low bit of our implied k. */ + + ASSERT ((fp[0] & 2) == 0); + /* fp is F[k-1]^2 == 0 or 1 mod 4, like all squares. */ + fp[0] |= (n & mask ? 2 : 0); /* possible -2 */ +#if HAVE_NATIVE_mpn_rsblsh2_n + fp[size] = mpn_rsblsh2_n (fp, fp, xp, size); + MPN_INCR_U(fp, size + 1, (n & mask ? 0 : 2)); /* possible +2 */ +#else + { + mp_limb_t c; + + c = mpn_lshift (xp, xp, size, 2); + xp[0] |= (n & mask ? 0 : 2); /* possible +2 */ + c -= mpn_sub_n (fp, xp, fp, size); + fp[size] = c; + } +#endif + ASSERT (alloc >= size+1); + size += (fp[size] != 0); + + /* now n&mask is the new bit of n being considered */ + mask >>= 1; + + /* Calculate F[2k] = F[2k+1] - F[2k-1], replacing the unwanted one of + F[2k+1] and F[2k-1]. */ + if (n & mask) + ASSERT_NOCARRY (mpn_sub_n (f1p, fp, f1p, size)); + else { + ASSERT_NOCARRY (mpn_sub_n ( fp, fp, f1p, size)); + + /* Can have a high zero after replacing F[2k+1] with F[2k]. + f1p will have a high zero if fp does. */ + ASSERT (fp[size-1] != 0 || f1p[size-1] == 0); + size -= (fp[size-1] == 0); + } + } + while (mask != 1); + + TMP_FREE; + } + + TRACE (printf ("done size=%ld\n", size); + mpn_trace ("fp ", fp, size); + mpn_trace ("f1p", f1p, size)); + + return size; +} diff --git a/gmp-6.3.0/mpn/generic/fib2m.c b/gmp-6.3.0/mpn/generic/fib2m.c new file mode 100644 index 0000000..89d2b86 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/fib2m.c @@ -0,0 +1,252 @@ +/* mpn_fib2m -- calculate Fibonacci numbers, modulo m. + +Contributed to the GNU project by Marco Bodrato, based on the previous +fib2_ui.c file. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2001, 2002, 2005, 2009, 2018 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include +#include "gmp-impl.h" +#include "longlong.h" + + +/* Stores |{ap,n}-{bp,n}| in {rp,n}, + returns the sign of {ap,n}-{bp,n}. */ +static int +abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n) +{ + mp_limb_t x, y; + while (--n >= 0) + { + x = ap[n]; + y = bp[n]; + if (x != y) + { + ++n; + if (x > y) + { + ASSERT_NOCARRY (mpn_sub_n (rp, ap, bp, n)); + return 1; + } + else + { + ASSERT_NOCARRY (mpn_sub_n (rp, bp, ap, n)); + return -1; + } + } + rp[n] = 0; + } + return 0; +} + +/* Store F[n] at fp and F[n-1] at f1p. Both are computed modulo m. + fp and f1p should have room for mn*2+1 limbs. + + The sign of one or both the values may be flipped (n-F, instead of F), + the return value is 0 (zero) if the signs are coherent (both positive + or both negative) and 1 (one) otherwise. + + Notes: + + In F[2k+1] with k even, +2 is applied to 4*F[k]^2 just by ORing into the + low limb. + + In F[2k+1] with k odd, -2 is applied to F[k-1]^2 just by ORing into the + low limb. + + TODO: Should {tp, 2 * mn} be passed as a scratch pointer? + Should the call to mpn_fib2_ui() obtain (up to) 2*mn limbs? +*/ + +int +mpn_fib2m (mp_ptr fp, mp_ptr f1p, mp_srcptr np, mp_size_t nn, mp_srcptr mp, mp_size_t mn) +{ + unsigned long nfirst; + mp_limb_t nh; + mp_bitcnt_t nbi; + mp_size_t sn, fn; + int fcnt, ncnt; + + ASSERT (! MPN_OVERLAP_P (fp, MAX(2*mn+1,5), f1p, MAX(2*mn+1,5))); + ASSERT (nn > 0 && np[nn - 1] != 0); + + /* Estimate the maximal n such that fibonacci(n) fits in mn limbs. */ +#if GMP_NUMB_BITS % 16 == 0 + if (UNLIKELY (ULONG_MAX / (23 * (GMP_NUMB_BITS / 16)) <= mn)) + nfirst = ULONG_MAX; + else + nfirst = mn * (23 * (GMP_NUMB_BITS / 16)); +#else + { + mp_bitcnt_t mbi; + mbi = (mp_bitcnt_t) mn * GMP_NUMB_BITS; + + if (UNLIKELY (ULONG_MAX / 23 < mbi)) + { + if (UNLIKELY (ULONG_MAX / 23 * 16 <= mbi)) + nfirst = ULONG_MAX; + else + nfirst = mbi / 16 * 23; + } + else + nfirst = mbi * 23 / 16; + } +#endif + + sn = nn - 1; + nh = np[sn]; + count_leading_zeros (ncnt, nh); + count_leading_zeros (fcnt, nfirst); + + if (fcnt >= ncnt) + { + ncnt = fcnt - ncnt; + nh >>= ncnt; + } + else if (sn > 0) + { + ncnt -= fcnt; + nh <<= ncnt; + ncnt = GMP_NUMB_BITS - ncnt; + --sn; + nh |= np[sn] >> ncnt; + } + else + ncnt = 0; + + nbi = sn * GMP_NUMB_BITS + ncnt; + if (nh > nfirst) + { + nh >>= 1; + ++nbi; + } + + ASSERT (nh <= nfirst); + /* Take a starting pair from mpn_fib2_ui. 
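+     Both values are then zero-padded to mn limbs; each doubling step
+     below squares them, combines the squares by the same identities as in
+     fib2_ui.c, and reduces the 2*mn+1 limb results modulo {mp,mn} with
+     mpn_tdiv_qr.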
*/ + fn = mpn_fib2_ui (fp, f1p, nh); + MPN_ZERO (fp + fn, mn - fn); + MPN_ZERO (f1p + fn, mn - fn); + + if (nbi == 0) + { + if (fn == mn) + { + mp_limb_t qp[2]; + mpn_tdiv_qr (qp, fp, 0, fp, fn, mp, mn); + mpn_tdiv_qr (qp, f1p, 0, f1p, fn, mp, mn); + } + + return 0; + } + else + { + mp_ptr tp; + unsigned pb = nh & 1; + int neg; + TMP_DECL; + + TMP_MARK; + + tp = TMP_ALLOC_LIMBS (2 * mn + (mn < 2)); + + do + { + mp_ptr rp; + /* Here fp==F[k] and f1p==F[k-1], with k being the bits of n from + nbi upwards. + + Based on the next bit of n, we'll double to the pair + fp==F[2k],f1p==F[2k-1] or fp==F[2k+1],f1p==F[2k], according as + that bit is 0 or 1 respectively. */ + + mpn_sqr (tp, fp, mn); + mpn_sqr (fp, f1p, mn); + + /* Calculate F[2k-1] = F[k]^2 + F[k-1]^2. */ + f1p[2 * mn] = mpn_add_n (f1p, tp, fp, 2 * mn); + + /* Calculate F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k. + pb is the low bit of our implied k. */ + + /* fp is F[k-1]^2 == 0 or 1 mod 4, like all squares. */ + ASSERT ((fp[0] & 2) == 0); + ASSERT (pb == (pb & 1)); + ASSERT ((fp[0] + (pb ? 2 : 0)) == (fp[0] | (pb << 1))); + fp[0] |= pb << 1; /* possible -2 */ +#if HAVE_NATIVE_mpn_rsblsh2_n + fp[2 * mn] = 1 + mpn_rsblsh2_n (fp, fp, tp, 2 * mn); + MPN_INCR_U(fp, 2 * mn + 1, (1 ^ pb) << 1); /* possible +2 */ + fp[2 * mn] = (fp[2 * mn] - 1) & GMP_NUMB_MAX; +#else + { + mp_limb_t c; + + c = mpn_lshift (tp, tp, 2 * mn, 2); + tp[0] |= (1 ^ pb) << 1; /* possible +2 */ + c -= mpn_sub_n (fp, tp, fp, 2 * mn); + fp[2 * mn] = c & GMP_NUMB_MAX; + } +#endif + neg = fp[2 * mn] == GMP_NUMB_MAX; + + /* Calculate F[2k-1] = F[k]^2 + F[k-1]^2 */ + /* Calculate F[2k+1] = 4*F[k]^2 - F[k-1]^2 + 2*(-1)^k */ + + /* Calculate F[2k] = F[2k+1] - F[2k-1], replacing the unwanted one of + F[2k+1] and F[2k-1]. */ + --nbi; + pb = (np [nbi / GMP_NUMB_BITS] >> (nbi % GMP_NUMB_BITS)) & 1; + rp = pb ? f1p : fp; + if (neg) + { + /* Calculate -(F[2k+1] - F[2k-1]) */ + rp[2 * mn] = f1p[2 * mn] + 1 - mpn_sub_n (rp, f1p, fp, 2 * mn); + neg = ! pb; + if (pb) /* fp not overwritten, negate it. */ + fp [2 * mn] = 1 ^ mpn_neg (fp, fp, 2 * mn); + } + else + { + neg = abs_sub_n (rp, fp, f1p, 2 * mn + 1) < 0; + } + + mpn_tdiv_qr (tp, fp, 0, fp, 2 * mn + 1, mp, mn); + mpn_tdiv_qr (tp, f1p, 0, f1p, 2 * mn + 1, mp, mn); + } + while (nbi != 0); + + TMP_FREE; + + return neg; + } +} diff --git a/gmp-6.3.0/mpn/generic/gcd.c b/gmp-6.3.0/mpn/generic/gcd.c new file mode 100644 index 0000000..3f92cbf --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcd.c @@ -0,0 +1,266 @@ +/* mpn/gcd.c: mpn_gcd for gcd of two odd integers. + +Copyright 1991, 1993-1998, 2000-2005, 2008, 2010, 2012, 2019 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. 
If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Uses the HGCD operation described in + + N. Möller, On Schönhage's algorithm and subquadratic integer gcd + computation, Math. Comp. 77 (2008), 589-607. + + to reduce inputs until they are of size below GCD_DC_THRESHOLD, and + then uses Lehmer's algorithm. +*/ + +/* Some reasonable choices are n / 2 (same as in hgcd), and p = (n + + * 2)/3, which gives a balanced multiplication in + * mpn_hgcd_matrix_adjust. However, p = 2 n/3 gives slightly better + * performance. The matrix-vector multiplication is then + * 4:1-unbalanced, with matrix elements of size n/6, and vector + * elements of size p = 2n/3. */ + +/* From analysis of the theoretical running time, it appears that when + * multiplication takes time O(n^alpha), p should be chosen so that + * the ratio of the time for the mpn_hgcd call, and the time for the + * multiplication in mpn_hgcd_matrix_adjust, is roughly 1/(alpha - + * 1). */ +#ifdef TUNE_GCD_P +#define P_TABLE_SIZE 10000 +mp_size_t p_table[P_TABLE_SIZE]; +#define CHOOSE_P(n) ( (n) < P_TABLE_SIZE ? p_table[n] : 2*(n)/3) +#else +#define CHOOSE_P(n) (2*(n) / 3) +#endif + +struct gcd_ctx +{ + mp_ptr gp; + mp_size_t gn; +}; + +static void +gcd_hook (void *p, mp_srcptr gp, mp_size_t gn, + mp_srcptr qp, mp_size_t qn, int d) +{ + struct gcd_ctx *ctx = (struct gcd_ctx *) p; + MPN_COPY (ctx->gp, gp, gn); + ctx->gn = gn; +} + +mp_size_t +mpn_gcd (mp_ptr gp, mp_ptr up, mp_size_t usize, mp_ptr vp, mp_size_t n) +{ + mp_size_t talloc; + mp_size_t scratch; + mp_size_t matrix_scratch; + + struct gcd_ctx ctx; + mp_ptr tp; + TMP_DECL; + + ASSERT (usize >= n); + ASSERT (n > 0); + ASSERT (vp[n-1] > 0); + + /* FIXME: Check for small sizes first, before setting up temporary + storage etc. */ + talloc = MPN_GCD_SUBDIV_STEP_ITCH(n); + + /* For initial division */ + scratch = usize - n + 1; + if (scratch > talloc) + talloc = scratch; + +#if TUNE_GCD_P + if (CHOOSE_P (n) > 0) +#else + if (ABOVE_THRESHOLD (n, GCD_DC_THRESHOLD)) +#endif + { + mp_size_t hgcd_scratch; + mp_size_t update_scratch; + mp_size_t p = CHOOSE_P (n); + mp_size_t scratch; +#if TUNE_GCD_P + /* Worst case, since we don't guarantee that n - CHOOSE_P(n) + is increasing */ + matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n); + hgcd_scratch = mpn_hgcd_itch (n); + update_scratch = 2*(n - 1); +#else + matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p); + hgcd_scratch = mpn_hgcd_itch (n - p); + update_scratch = p + n - 1; +#endif + scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch); + if (scratch > talloc) + talloc = scratch; + } + + TMP_MARK; + tp = TMP_ALLOC_LIMBS(talloc); + + if (usize > n) + { + mpn_tdiv_qr (tp, up, 0, up, usize, vp, n); + + if (mpn_zero_p (up, n)) + { + MPN_COPY (gp, vp, n); + ctx.gn = n; + goto done; + } + } + + ctx.gp = gp; + +#if TUNE_GCD_P + while (CHOOSE_P (n) > 0) +#else + while (ABOVE_THRESHOLD (n, GCD_DC_THRESHOLD)) +#endif + { + struct hgcd_matrix M; + mp_size_t p = CHOOSE_P (n); + mp_size_t matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p); + mp_size_t nn; + mpn_hgcd_matrix_init (&M, n - p, tp); + nn = mpn_hgcd (up + p, vp + p, n - p, &M, tp + matrix_scratch); + if (nn > 0) + { + ASSERT (M.n <= (n - p - 1)/2); + ASSERT (M.n + p <= (p + n - 1) / 2); + /* Temporary storage 2 (p + M->n) <= p + n - 1. 
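+	     Indeed, the assert above gives M->n <= (n - p - 1)/2, so
+	     2*(p + M->n) <= 2*p + (n - p - 1) = p + n - 1.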
*/ + n = mpn_hgcd_matrix_adjust (&M, p + nn, up, vp, p, tp + matrix_scratch); + } + else + { + /* Temporary storage n */ + n = mpn_gcd_subdiv_step (up, vp, n, 0, gcd_hook, &ctx, tp); + if (n == 0) + goto done; + } + } + + while (n > 2) + { + struct hgcd_matrix1 M; + mp_limb_t uh, ul, vh, vl; + mp_limb_t mask; + + mask = up[n-1] | vp[n-1]; + ASSERT (mask > 0); + + if (mask & GMP_NUMB_HIGHBIT) + { + uh = up[n-1]; ul = up[n-2]; + vh = vp[n-1]; vl = vp[n-2]; + } + else + { + int shift; + + count_leading_zeros (shift, mask); + uh = MPN_EXTRACT_NUMB (shift, up[n-1], up[n-2]); + ul = MPN_EXTRACT_NUMB (shift, up[n-2], up[n-3]); + vh = MPN_EXTRACT_NUMB (shift, vp[n-1], vp[n-2]); + vl = MPN_EXTRACT_NUMB (shift, vp[n-2], vp[n-3]); + } + + /* Try an mpn_hgcd2 step */ + if (mpn_hgcd2 (uh, ul, vh, vl, &M)) + { + n = mpn_matrix22_mul1_inverse_vector (&M, tp, up, vp, n); + MP_PTR_SWAP (up, tp); + } + else + { + /* mpn_hgcd2 has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. */ + + /* Temporary storage n */ + n = mpn_gcd_subdiv_step (up, vp, n, 0, &gcd_hook, &ctx, tp); + if (n == 0) + goto done; + } + } + + ASSERT(up[n-1] | vp[n-1]); + + /* Due to the calling convention for mpn_gcd, at most one can be even. */ + if ((up[0] & 1) == 0) + MP_PTR_SWAP (up, vp); + ASSERT ((up[0] & 1) != 0); + + { + mp_limb_t u0, u1, v0, v1; + mp_double_limb_t g; + + u0 = up[0]; + v0 = vp[0]; + + if (n == 1) + { + int cnt; + count_trailing_zeros (cnt, v0); + *gp = mpn_gcd_11 (u0, v0 >> cnt); + ctx.gn = 1; + goto done; + } + + v1 = vp[1]; + if (UNLIKELY (v0 == 0)) + { + v0 = v1; + v1 = 0; + /* FIXME: We could invoke a mpn_gcd_21 here, just like mpn_gcd_22 could + when this situation occurs internally. */ + } + if ((v0 & 1) == 0) + { + int cnt; + count_trailing_zeros (cnt, v0); + v0 = ((v1 << (GMP_NUMB_BITS - cnt)) & GMP_NUMB_MASK) | (v0 >> cnt); + v1 >>= cnt; + } + + u1 = up[1]; + g = mpn_gcd_22 (u1, u0, v1, v0); + gp[0] = g.d0; + gp[1] = g.d1; + ctx.gn = 1 + (g.d1 > 0); + } +done: + TMP_FREE; + return ctx.gn; +} diff --git a/gmp-6.3.0/mpn/generic/gcd_1.c b/gmp-6.3.0/mpn/generic/gcd_1.c new file mode 100644 index 0000000..22b1422 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcd_1.c @@ -0,0 +1,103 @@ +/* mpn_gcd_1 -- mpn and limb greatest common divisor. + +Copyright 1994, 1996, 2000, 2001, 2009, 2012, 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Does not work for U == 0 or V == 0. 
It would be tough to make it work for + V == 0 since gcd(x,0) = x, and U does not generally fit in an mp_limb_t. + + The threshold for doing u%v when size==1 will vary by CPU according to + the speed of a division and the code generated for the main loop. Any + tuning for this is left to a CPU specific implementation. */ + +mp_limb_t +mpn_gcd_1 (mp_srcptr up, mp_size_t size, mp_limb_t vlimb) +{ + mp_limb_t ulimb; + unsigned long zero_bits, u_low_zero_bits; + int c; + + ASSERT (size >= 1); + ASSERT (vlimb != 0); + ASSERT_MPN_NONZERO_P (up, size); + + ulimb = up[0]; + + /* Need vlimb odd for modexact, want it odd to get common zeros. */ + count_trailing_zeros (zero_bits, vlimb); + vlimb >>= zero_bits; + + if (size > 1) + { + /* Must get common zeros before the mod reduction. If ulimb==0 then + vlimb already gives the common zeros. */ + if (ulimb != 0) + { + count_trailing_zeros (u_low_zero_bits, ulimb); + zero_bits = MIN (zero_bits, u_low_zero_bits); + } + + ulimb = MPN_MOD_OR_MODEXACT_1_ODD (up, size, vlimb); + if (ulimb == 0) + goto done; + + count_trailing_zeros (c, ulimb); + ulimb >>= c; + } + else + { + /* size==1, so up[0]!=0 */ + count_trailing_zeros (u_low_zero_bits, ulimb); + ulimb >>= u_low_zero_bits; + zero_bits = MIN (zero_bits, u_low_zero_bits); + + /* make u bigger */ + if (vlimb > ulimb) + MP_LIMB_T_SWAP (ulimb, vlimb); + + /* if u is much bigger than v, reduce using a division rather than + chipping away at it bit-by-bit */ + if ((ulimb >> 16) > vlimb) + { + ulimb %= vlimb; + if (ulimb == 0) + goto done; + + count_trailing_zeros (c, ulimb); + ulimb >>= c; + } + } + + vlimb = mpn_gcd_11 (ulimb, vlimb); + + done: + return vlimb << zero_bits; +} diff --git a/gmp-6.3.0/mpn/generic/gcd_11.c b/gmp-6.3.0/mpn/generic/gcd_11.c new file mode 100644 index 0000000..214e45c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcd_11.c @@ -0,0 +1,74 @@ +/* mpn_gcd_11 -- limb greatest common divisor. + +Copyright 1994, 1996, 2000, 2001, 2009, 2012, 2019 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_gcd_11 (mp_limb_t u, mp_limb_t v) +{ + ASSERT (u & v & 1); + + /* In this loop, we represent the odd numbers ulimb and vlimb + without the redundant least significant one bit. This reduction + in size by one bit ensures that the high bit of t, below, is set + if and only if vlimb > ulimb. 
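+
+     For instance, with u = 21 and v = 9 the halved values are 10 and 4;
+     t = 10 - 4 = 6 has its high bit clear, whereas subtracting in the
+     other order would wrap around and set it.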
*/ + + u >>= 1; + v >>= 1; + + while (u != v) + { + mp_limb_t t; + mp_limb_t vgtu; + int c; + + t = u - v; + vgtu = LIMB_HIGHBIT_TO_MASK (t); + + /* v <-- min (u, v) */ + v += (vgtu & t); + + /* u <-- |u - v| */ + u = (t ^ vgtu) - vgtu; + + count_trailing_zeros (c, t); + /* We have c <= GMP_LIMB_BITS - 2 here, so that + + ulimb >>= (c + 1); + + would be safe. But unlike the addition c + 1, a separate + shift by 1 is independent of c, and can be executed in + parallel with count_trailing_zeros. */ + u = (u >> 1) >> c; + } + return (u << 1) + 1; +} diff --git a/gmp-6.3.0/mpn/generic/gcd_22.c b/gmp-6.3.0/mpn/generic/gcd_22.c new file mode 100644 index 0000000..d97f096 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcd_22.c @@ -0,0 +1,131 @@ +/* mpn_gcd_22 -- double limb greatest common divisor. + +Copyright 1994, 1996, 2000, 2001, 2009, 2012, 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#if GMP_NAIL_BITS > 0 +#error Nails not supported. +#endif + +mp_double_limb_t +mpn_gcd_22 (mp_limb_t u1, mp_limb_t u0, mp_limb_t v1, mp_limb_t v0) +{ + mp_double_limb_t g; + ASSERT (u0 & v0 & 1); + + /* Implicit least significant bit */ + u0 = (u0 >> 1) | (u1 << (GMP_LIMB_BITS - 1)); + u1 >>= 1; + + v0 = (v0 >> 1) | (v1 << (GMP_LIMB_BITS - 1)); + v1 >>= 1; + + while (u1 || v1) /* u1 == 0 can happen at most twice per call */ + { + mp_limb_t vgtu, t1, t0; + sub_ddmmss (t1, t0, u1, u0, v1, v0); + vgtu = LIMB_HIGHBIT_TO_MASK(t1); + + if (UNLIKELY (t0 == 0)) + { + if (t1 == 0) + { + g.d1 = (u1 << 1) | (u0 >> (GMP_LIMB_BITS - 1)); + g.d0 = (u0 << 1) | 1; + return g; + } + int c; + count_trailing_zeros (c, t1); + + /* v1 = min (u1, v1) */ + v1 += (vgtu & t1); + /* u0 = |u1 - v1| */ + u0 = (t1 ^ vgtu) - vgtu; + ASSERT (c < GMP_LIMB_BITS - 1); + u0 >>= c + 1; + u1 = 0; + } + else + { + int c; + count_trailing_zeros (c, t0); + c++; + /* V <-- min (U, V). + + Assembly version should use cmov. Another alternative, + avoiding carry propagation, would be + + v0 += vgtu & t0; v1 += vtgu & (u1 - v1); + */ + add_ssaaaa (v1, v0, v1, v0, vgtu & t1, vgtu & t0); + /* U <-- |U - V| + No carry handling needed in this conditional negation, + since t0 != 0. 
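+	     Concretely: since t0 != 0, -(t1,t0) mod B^2 has low limb -t0
+	     and high limb exactly ~t1, so u1 needs only the XOR with vgtu
+	     and no increment.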
*/ + u0 = (t0 ^ vgtu) - vgtu; + u1 = t1 ^ vgtu; + if (UNLIKELY (c == GMP_LIMB_BITS)) + { + u0 = u1; + u1 = 0; + } + else + { + u0 = (u0 >> c) | (u1 << (GMP_LIMB_BITS - c)); + u1 >>= c; + } + } + } + while ((v0 | u0) & GMP_LIMB_HIGHBIT) + { /* At most two iterations */ + mp_limb_t vgtu, t0; + int c; + sub_ddmmss (vgtu, t0, 0, u0, 0, v0); + if (UNLIKELY (t0 == 0)) + { + g.d1 = u0 >> (GMP_LIMB_BITS - 1); + g.d0 = (u0 << 1) | 1; + return g; + } + + /* v <-- min (u, v) */ + v0 += (vgtu & t0); + + /* u <-- |u - v| */ + u0 = (t0 ^ vgtu) - vgtu; + + count_trailing_zeros (c, t0); + u0 = (u0 >> 1) >> c; + } + + g.d0 = mpn_gcd_11 ((u0 << 1) + 1, (v0 << 1) + 1); + g.d1 = 0; + return g; +} diff --git a/gmp-6.3.0/mpn/generic/gcd_subdiv_step.c b/gmp-6.3.0/mpn/generic/gcd_subdiv_step.c new file mode 100644 index 0000000..9c3b88d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcd_subdiv_step.c @@ -0,0 +1,204 @@ +/* gcd_subdiv_step.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003-2005, 2008, 2010, 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include /* for NULL */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Used when mpn_hgcd or mpn_hgcd2 has failed. Then either one of a or + b is small, or the difference is small. Perform one subtraction + followed by one division. The normal case is to compute the reduced + a and b, and return the new size. + + If s == 0 (used for gcd and gcdext), returns zero if the gcd is + found. + + If s > 0, don't reduce to size <= s, and return zero if no + reduction is possible (if either a, b or |a-b| is of size <= s). */ + +/* The hook function is called as + + hook(ctx, gp, gn, qp, qn, d) + + in the following cases: + + + If A = B at the start, G is the gcd, Q is NULL, d = -1. + + + If one input is zero at the start, G is the gcd, Q is NULL, + d = 0 if A = G and d = 1 if B = G. + + Otherwise, if d = 0 we have just subtracted a multiple of A from B, + and if d = 1 we have subtracted a multiple of B from A. + + + If A = B after subtraction, G is the gcd, Q is NULL. + + + If we get a zero remainder after division, G is the gcd, Q is the + quotient. + + + Otherwise, G is NULL, Q is the quotient (often 1). 
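+
+   For a concrete example of such a hook, see gcd_hook in gcd.c, which
+   simply records {gp,gn} when a gcd is delivered and ignores the
+   quotients.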
+ + */ + +mp_size_t +mpn_gcd_subdiv_step (mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t s, + gcd_subdiv_step_hook *hook, void *ctx, + mp_ptr tp) +{ + static const mp_limb_t one = CNST_LIMB(1); + mp_size_t an, bn, qn; + + int swapped; + + ASSERT (n > 0); + ASSERT (ap[n-1] > 0 || bp[n-1] > 0); + + an = bn = n; + MPN_NORMALIZE (ap, an); + MPN_NORMALIZE (bp, bn); + + swapped = 0; + + /* Arrange so that a < b, subtract b -= a, and maintain + normalization. */ + if (an == bn) + { + int c; + MPN_CMP (c, ap, bp, an); + if (UNLIKELY (c == 0)) + { + /* For gcdext, return the smallest of the two cofactors, so + pass d = -1. */ + if (s == 0) + hook (ctx, ap, an, NULL, 0, -1); + return 0; + } + else if (c > 0) + { + MP_PTR_SWAP (ap, bp); + swapped ^= 1; + } + } + else + { + if (an > bn) + { + MPN_PTR_SWAP (ap, an, bp, bn); + swapped ^= 1; + } + } + if (an <= s) + { + if (s == 0) + hook (ctx, bp, bn, NULL, 0, swapped ^ 1); + return 0; + } + + ASSERT_NOCARRY (mpn_sub (bp, bp, bn, ap, an)); + MPN_NORMALIZE (bp, bn); + ASSERT (bn > 0); + + if (bn <= s) + { + /* Undo subtraction. */ + mp_limb_t cy = mpn_add (bp, ap, an, bp, bn); + if (cy > 0) + bp[an] = cy; + return 0; + } + + /* Arrange so that a < b */ + if (an == bn) + { + int c; + MPN_CMP (c, ap, bp, an); + if (UNLIKELY (c == 0)) + { + if (s > 0) + /* Just record subtraction and return */ + hook (ctx, NULL, 0, &one, 1, swapped); + else + /* Found gcd. */ + hook (ctx, bp, bn, NULL, 0, swapped); + return 0; + } + + hook (ctx, NULL, 0, &one, 1, swapped); + + if (c > 0) + { + MP_PTR_SWAP (ap, bp); + swapped ^= 1; + } + } + else + { + hook (ctx, NULL, 0, &one, 1, swapped); + + if (an > bn) + { + MPN_PTR_SWAP (ap, an, bp, bn); + swapped ^= 1; + } + } + + mpn_tdiv_qr (tp, bp, 0, bp, bn, ap, an); + qn = bn - an + 1; + bn = an; + MPN_NORMALIZE (bp, bn); + + if (UNLIKELY (bn <= s)) + { + if (s == 0) + { + hook (ctx, ap, an, tp, qn, swapped); + return 0; + } + + /* Quotient is one too large, so decrement it and add back A. */ + if (bn > 0) + { + mp_limb_t cy = mpn_add (bp, ap, an, bp, bn); + if (cy) + bp[an++] = cy; + } + else + MPN_COPY (bp, ap, an); + + MPN_DECR_U (tp, qn, 1); + } + + hook (ctx, NULL, 0, tp, qn, swapped); + return an; +} diff --git a/gmp-6.3.0/mpn/generic/gcdext.c b/gmp-6.3.0/mpn/generic/gcdext.c new file mode 100644 index 0000000..5501480 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcdext.c @@ -0,0 +1,557 @@ +/* mpn_gcdext -- Extended Greatest Common Divisor. + +Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Computes (r;b) = (a; b) M. Result is of size n + M->n +/- 1, and + the size is returned (if inputs are non-normalized, result may be + non-normalized too). Temporary space needed is M->n + n. + */ +static size_t +hgcd_mul_matrix_vector (struct hgcd_matrix *M, + mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n, mp_ptr tp) +{ + mp_limb_t ah, bh; + + /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as + + t = u00 * a + r = u10 * b + r += t; + + t = u11 * b + b = u01 * a + b += t; + */ + + if (M->n >= n) + { + mpn_mul (tp, M->p[0][0], M->n, ap, n); + mpn_mul (rp, M->p[1][0], M->n, bp, n); + } + else + { + mpn_mul (tp, ap, n, M->p[0][0], M->n); + mpn_mul (rp, bp, n, M->p[1][0], M->n); + } + + ah = mpn_add_n (rp, rp, tp, n + M->n); + + if (M->n >= n) + { + mpn_mul (tp, M->p[1][1], M->n, bp, n); + mpn_mul (bp, M->p[0][1], M->n, ap, n); + } + else + { + mpn_mul (tp, bp, n, M->p[1][1], M->n); + mpn_mul (bp, ap, n, M->p[0][1], M->n); + } + bh = mpn_add_n (bp, bp, tp, n + M->n); + + n += M->n; + if ( (ah | bh) > 0) + { + rp[n] = ah; + bp[n] = bh; + n++; + } + else + { + /* Normalize */ + while ( (rp[n-1] | bp[n-1]) == 0) + n--; + } + + return n; +} + +#define COMPUTE_V_ITCH(n) (2*(n)) + +/* Computes |v| = |(g - u a)| / b, where u may be positive or + negative, and v is of the opposite sign. max(a, b) is of size n, u and + v at most size n, and v must have space for n+1 limbs. */ +static mp_size_t +compute_v (mp_ptr vp, + mp_srcptr ap, mp_srcptr bp, mp_size_t n, + mp_srcptr gp, mp_size_t gn, + mp_srcptr up, mp_size_t usize, + mp_ptr tp) +{ + mp_size_t size; + mp_size_t an; + mp_size_t bn; + mp_size_t vn; + + ASSERT (n > 0); + ASSERT (gn > 0); + ASSERT (usize != 0); + + size = ABS (usize); + ASSERT (size <= n); + ASSERT (up[size-1] > 0); + + an = n; + MPN_NORMALIZE (ap, an); + ASSERT (gn <= an); + + if (an >= size) + mpn_mul (tp, ap, an, up, size); + else + mpn_mul (tp, up, size, ap, an); + + size += an; + + if (usize > 0) + { + /* |v| = -v = (u a - g) / b */ + + ASSERT_NOCARRY (mpn_sub (tp, tp, size, gp, gn)); + MPN_NORMALIZE (tp, size); + if (size == 0) + return 0; + } + else + { /* |v| = v = (g - u a) / b = (g + |u| a) / b. Since g <= a, + (g + |u| a) always fits in (|usize| + an) limbs. */ + + ASSERT_NOCARRY (mpn_add (tp, tp, size, gp, gn)); + size -= (tp[size - 1] == 0); + } + + /* Now divide t / b. There must be no remainder */ + bn = n; + MPN_NORMALIZE (bp, bn); + ASSERT (size >= bn); + + vn = size + 1 - bn; + ASSERT (vn <= n + 1); + + mpn_divexact (vp, tp, size, bp, bn); + vn -= (vp[vn-1] == 0); + + return vn; +} + +/* Temporary storage: + + Initial division: Quotient of at most an - n + 1 <= an limbs. + + Storage for u0 and u1: 2(n+1). + + Storage for hgcd matrix M, with input ceil(n/2): 5 * ceil(n/4) + + Storage for hgcd, input (n + 1)/2: 9 n/4 plus some. + + When hgcd succeeds: 1 + floor(3n/2) for adjusting a and b, and 2(n+1) for the cofactors. + + When hgcd fails: 2n + 1 for mpn_gcdext_subdiv_step, which is less. + + For the lehmer call after the loop, Let T denote + GCDEXT_DC_THRESHOLD. For the gcdext_lehmer call, we need T each for + u, a and b, and 4T+3 scratch space. Next, for compute_v, we need T + for u, T+1 for v and 2T scratch space. In all, 7T + 3 is + sufficient for both operations. + +*/ + +/* Optimal choice of p seems difficult. In each iteration the division + * of work between hgcd and the updates of u0 and u1 depends on the + * current size of the u. 
It may be desirable to use a different + * choice of p in each iteration. Also the input size seems to matter; + * choosing p = n / 3 in the first iteration seems to improve + * performance slightly for input size just above the threshold, but + * degrade performance for larger inputs. */ +#define CHOOSE_P_1(n) ((n) / 2) +#define CHOOSE_P_2(n) ((n) / 3) + +mp_size_t +mpn_gcdext (mp_ptr gp, mp_ptr up, mp_size_t *usizep, + mp_ptr ap, mp_size_t an, mp_ptr bp, mp_size_t n) +{ + mp_size_t talloc; + mp_size_t scratch; + mp_size_t matrix_scratch; + mp_size_t ualloc = n + 1; + + struct gcdext_ctx ctx; + mp_size_t un; + mp_ptr u0; + mp_ptr u1; + + mp_ptr tp; + + TMP_DECL; + + ASSERT (an >= n); + ASSERT (n > 0); + ASSERT (bp[n-1] > 0); + + TMP_MARK; + + /* FIXME: Check for small sizes first, before setting up temporary + storage etc. */ + talloc = MPN_GCDEXT_LEHMER_N_ITCH(n); + + /* For initial division */ + scratch = an - n + 1; + if (scratch > talloc) + talloc = scratch; + + if (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) + { + /* For hgcd loop. */ + mp_size_t hgcd_scratch; + mp_size_t update_scratch; + mp_size_t p1 = CHOOSE_P_1 (n); + mp_size_t p2 = CHOOSE_P_2 (n); + mp_size_t min_p = MIN(p1, p2); + mp_size_t max_p = MAX(p1, p2); + matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - min_p); + hgcd_scratch = mpn_hgcd_itch (n - min_p); + update_scratch = max_p + n - 1; + + scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch); + if (scratch > talloc) + talloc = scratch; + + /* Final mpn_gcdext_lehmer_n call. Need space for u and for + copies of a and b. */ + scratch = MPN_GCDEXT_LEHMER_N_ITCH (GCDEXT_DC_THRESHOLD) + + 3*GCDEXT_DC_THRESHOLD; + + if (scratch > talloc) + talloc = scratch; + + /* Cofactors u0 and u1 */ + talloc += 2*(n+1); + } + + tp = TMP_ALLOC_LIMBS(talloc); + + if (an > n) + { + mpn_tdiv_qr (tp, ap, 0, ap, an, bp, n); + + if (mpn_zero_p (ap, n)) + { + MPN_COPY (gp, bp, n); + *usizep = 0; + TMP_FREE; + return n; + } + } + + if (BELOW_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) + { + mp_size_t gn = mpn_gcdext_lehmer_n(gp, up, usizep, ap, bp, n, tp); + + TMP_FREE; + return gn; + } + + MPN_ZERO (tp, 2*ualloc); + u0 = tp; tp += ualloc; + u1 = tp; tp += ualloc; + + ctx.gp = gp; + ctx.up = up; + ctx.usize = usizep; + + { + /* For the first hgcd call, there are no u updates, and it makes + some sense to use a different choice for p. */ + + /* FIXME: We could trim use of temporary storage, since u0 and u1 + are not used yet. For the hgcd call, we could swap in the u0 + and u1 pointers for the relevant matrix elements. */ + + struct hgcd_matrix M; + mp_size_t p = CHOOSE_P_1 (n); + mp_size_t nn; + + mpn_hgcd_matrix_init (&M, n - p, tp); + nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch); + if (nn > 0) + { + ASSERT (M.n <= (n - p - 1)/2); + ASSERT (M.n + p <= (p + n - 1) / 2); + + /* Temporary storage 2 (p + M->n) <= p + n - 1 */ + n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch); + + MPN_COPY (u0, M.p[1][0], M.n); + MPN_COPY (u1, M.p[1][1], M.n); + un = M.n; + while ( (u0[un-1] | u1[un-1] ) == 0) + un--; + } + else + { + /* mpn_hgcd has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. 
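+	     (For orientation: at this point no cofactor information
+	     exists yet, so u1 is seeded with 1 while u0 stays zero,
+	     making the relation B = u0 a + u1 b described after the
+	     main loop hold trivially; mpn_gcdext_hook then keeps u0 and
+	     u1 up to date as the subdivision step subtracts and
+	     divides.)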
*/ + u1[0] = 1; + + ctx.u0 = u0; + ctx.u1 = u1; + ctx.tp = tp + n; /* ualloc */ + ctx.un = 1; + + /* Temporary storage n */ + n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp); + if (n == 0) + { + TMP_FREE; + return ctx.gn; + } + + un = ctx.un; + ASSERT (un < ualloc); + } + } + + while (ABOVE_THRESHOLD (n, GCDEXT_DC_THRESHOLD)) + { + struct hgcd_matrix M; + mp_size_t p = CHOOSE_P_2 (n); + mp_size_t nn; + + mpn_hgcd_matrix_init (&M, n - p, tp); + nn = mpn_hgcd (ap + p, bp + p, n - p, &M, tp + matrix_scratch); + if (nn > 0) + { + mp_ptr t0; + + t0 = tp + matrix_scratch; + ASSERT (M.n <= (n - p - 1)/2); + ASSERT (M.n + p <= (p + n - 1) / 2); + + /* Temporary storage 2 (p + M->n) <= p + n - 1 */ + n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, t0); + + /* By the same analysis as for mpn_hgcd_matrix_mul */ + ASSERT (M.n + un <= ualloc); + + /* FIXME: This copying could be avoided by some swapping of + * pointers. May need more temporary storage, though. */ + MPN_COPY (t0, u0, un); + + /* Temporary storage ualloc */ + un = hgcd_mul_matrix_vector (&M, u0, t0, u1, un, t0 + un); + + ASSERT (un < ualloc); + ASSERT ( (u0[un-1] | u1[un-1]) > 0); + } + else + { + /* mpn_hgcd has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. */ + ctx.u0 = u0; + ctx.u1 = u1; + ctx.tp = tp + n; /* ualloc */ + ctx.un = un; + + /* Temporary storage n */ + n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp); + if (n == 0) + { + TMP_FREE; + return ctx.gn; + } + + un = ctx.un; + ASSERT (un < ualloc); + } + } + /* We have A = ... a + ... b + B = u0 a + u1 b + + a = u1 A + ... B + b = -u0 A + ... B + + with bounds + + |u0|, |u1| <= B / min(a, b) + + We always have u1 > 0, and u0 == 0 is possible only if u1 == 1, + in which case the only reduction done so far is a = A - k B for + some k. + + Compute g = u a + v b = (u u1 - v u0) A + (...) B + Here, u, v are bounded by + + |u| <= b, + |v| <= a + */ + + ASSERT ( (ap[n-1] | bp[n-1]) > 0); + + if (UNLIKELY (mpn_cmp (ap, bp, n) == 0)) + { + /* Must return the smallest cofactor, +u1 or -u0 */ + int c; + + MPN_COPY (gp, ap, n); + + MPN_CMP (c, u0, u1, un); + /* c == 0 can happen only when A = (2k+1) G, B = 2 G. And in + this case we choose the cofactor + 1, corresponding to G = A + - k B, rather than -1, corresponding to G = - A + (k+1) B. */ + ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1)); + if (c < 0) + { + MPN_NORMALIZE (u0, un); + MPN_COPY (up, u0, un); + *usizep = -un; + } + else + { + MPN_NORMALIZE_NOT_ZERO (u1, un); + MPN_COPY (up, u1, un); + *usizep = un; + } + + TMP_FREE; + return n; + } + else if (UNLIKELY (u0[0] == 0) && un == 1) + { + mp_size_t gn; + ASSERT (u1[0] == 1); + + /* g = u a + v b = (u u1 - v u0) A + (...) B = u A + (...) B */ + gn = mpn_gcdext_lehmer_n (gp, up, usizep, ap, bp, n, tp); + + TMP_FREE; + return gn; + } + else + { + mp_size_t u0n; + mp_size_t u1n; + mp_size_t lehmer_un; + mp_size_t lehmer_vn; + mp_size_t gn; + + mp_ptr lehmer_up; + mp_ptr lehmer_vp; + int negate; + + lehmer_up = tp; tp += n; + + /* Call mpn_gcdext_lehmer_n with copies of a and b. */ + MPN_COPY (tp, ap, n); + MPN_COPY (tp + n, bp, n); + gn = mpn_gcdext_lehmer_n (gp, lehmer_up, &lehmer_un, tp, tp + n, n, tp + 2*n); + + u0n = un; + MPN_NORMALIZE (u0, u0n); + ASSERT (u0n > 0); + + if (lehmer_un == 0) + { + /* u == 0 ==> v = g / b == 1 ==> g = - u0 A + (...) 
B */ + MPN_COPY (up, u0, u0n); + *usizep = -u0n; + + TMP_FREE; + return gn; + } + + lehmer_vp = tp; + /* Compute v = (g - u a) / b */ + lehmer_vn = compute_v (lehmer_vp, + ap, bp, n, gp, gn, lehmer_up, lehmer_un, tp + n + 1); + + if (lehmer_un > 0) + negate = 0; + else + { + lehmer_un = -lehmer_un; + negate = 1; + } + + u1n = un; + MPN_NORMALIZE (u1, u1n); + ASSERT (u1n > 0); + + ASSERT (lehmer_un + u1n <= ualloc); + ASSERT (lehmer_vn + u0n <= ualloc); + + /* We may still have v == 0 */ + + /* Compute u u0 */ + if (lehmer_un <= u1n) + /* Should be the common case */ + mpn_mul (up, u1, u1n, lehmer_up, lehmer_un); + else + mpn_mul (up, lehmer_up, lehmer_un, u1, u1n); + + un = u1n + lehmer_un; + un -= (up[un - 1] == 0); + + if (lehmer_vn > 0) + { + mp_limb_t cy; + + /* Overwrites old u1 value */ + if (lehmer_vn <= u0n) + /* Should be the common case */ + mpn_mul (u1, u0, u0n, lehmer_vp, lehmer_vn); + else + mpn_mul (u1, lehmer_vp, lehmer_vn, u0, u0n); + + u1n = u0n + lehmer_vn; + u1n -= (u1[u1n - 1] == 0); + + if (u1n <= un) + { + cy = mpn_add (up, up, un, u1, u1n); + } + else + { + cy = mpn_add (up, u1, u1n, up, un); + un = u1n; + } + up[un] = cy; + un += (cy != 0); + + ASSERT (un < ualloc); + } + *usizep = negate ? -un : un; + + TMP_FREE; + return gn; + } +} diff --git a/gmp-6.3.0/mpn/generic/gcdext_1.c b/gmp-6.3.0/mpn/generic/gcdext_1.c new file mode 100644 index 0000000..b221a92 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcdext_1.c @@ -0,0 +1,275 @@ +/* mpn_gcdext -- Extended Greatest Common Divisor. + +Copyright 1996, 1998, 2000-2005, 2008, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef GCDEXT_1_USE_BINARY +#define GCDEXT_1_USE_BINARY 0 +#endif + +#ifndef GCDEXT_1_BINARY_METHOD +#define GCDEXT_1_BINARY_METHOD 2 +#endif + +#if GCDEXT_1_USE_BINARY + +mp_limb_t +mpn_gcdext_1 (mp_limb_signed_t *sp, mp_limb_signed_t *tp, + mp_limb_t u, mp_limb_t v) +{ + /* Maintain + + U = t1 u + t0 v + V = s1 u + s0 v + + where U, V are the inputs (without any shared power of two), + and the matrix has determinant ± 2^{shift}. 
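+     Worked illustration of the starting state: after the shared power
+     of two has been removed we have u = U and v = V, so the code below
+     seeds the coefficients with the identity matrix,
+
+       t1 = 1, t0 = 0   so that  U = 1*u + 0*v,
+       s1 = 0, s0 = 1   so that  V = 0*u + 1*v,
+
+     and, before any shifting has been applied, the determinant is
+     +1 = 2^0.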
+ */ + mp_limb_t s0 = 1; + mp_limb_t t0 = 0; + mp_limb_t s1 = 0; + mp_limb_t t1 = 1; + mp_limb_t ug; + mp_limb_t vg; + mp_limb_t ugh; + mp_limb_t vgh; + unsigned zero_bits; + unsigned shift; + unsigned i; +#if GCDEXT_1_BINARY_METHOD == 2 + mp_limb_t det_sign; +#endif + + ASSERT (u > 0); + ASSERT (v > 0); + + count_trailing_zeros (zero_bits, u | v); + u >>= zero_bits; + v >>= zero_bits; + + if ((u & 1) == 0) + { + count_trailing_zeros (shift, u); + u >>= shift; + t1 <<= shift; + } + else if ((v & 1) == 0) + { + count_trailing_zeros (shift, v); + v >>= shift; + s0 <<= shift; + } + else + shift = 0; + +#if GCDEXT_1_BINARY_METHOD == 1 + while (u != v) + { + unsigned count; + if (u > v) + { + u -= v; + + count_trailing_zeros (count, u); + u >>= count; + + t0 += t1; t1 <<= count; + s0 += s1; s1 <<= count; + } + else + { + v -= u; + + count_trailing_zeros (count, v); + v >>= count; + + t1 += t0; t0 <<= count; + s1 += s0; s0 <<= count; + } + shift += count; + } +#else +# if GCDEXT_1_BINARY_METHOD == 2 + u >>= 1; + v >>= 1; + + det_sign = 0; + + while (u != v) + { + unsigned count; + mp_limb_t d = u - v; + mp_limb_t vgtu = LIMB_HIGHBIT_TO_MASK (d); + mp_limb_t sx; + mp_limb_t tx; + + /* When v <= u (vgtu == 0), the updates are: + + (u; v) <-- ( (u - v) >> count; v) (det = +(1< 0, the updates are + + (u; v) <-- ( (v - u) >> count; u) (det = -(1<>= count; + t1 <<= count; + s1 <<= count; + shift += count; + } + u = (u << 1) + 1; +# else /* GCDEXT_1_BINARY_METHOD == 2 */ +# error Unknown GCDEXT_1_BINARY_METHOD +# endif +#endif + + /* Now u = v = g = gcd (u,v). Compute U/g and V/g */ + ug = t0 + t1; + vg = s0 + s1; + + ugh = ug/2 + (ug & 1); + vgh = vg/2 + (vg & 1); + + /* Now 2^{shift} g = s0 U - t0 V. Get rid of the power of two, using + s0 U - t0 V = (s0 + V/g) U - (t0 + U/g) V. */ + for (i = 0; i < shift; i++) + { + mp_limb_t mask = - ( (s0 | t0) & 1); + + s0 /= 2; + t0 /= 2; + s0 += mask & vgh; + t0 += mask & ugh; + } + + ASSERT_ALWAYS (s0 <= vg); + ASSERT_ALWAYS (t0 <= ug); + + if (s0 > vg - s0) + { + s0 -= vg; + t0 -= ug; + } +#if GCDEXT_1_BINARY_METHOD == 2 + /* Conditional negation. */ + s0 = (s0 ^ det_sign) - det_sign; + t0 = (t0 ^ det_sign) - det_sign; +#endif + *sp = s0; + *tp = -t0; + + return u << zero_bits; +} + +#else /* !GCDEXT_1_USE_BINARY */ + + +/* FIXME: Takes two single-word limbs. It could be extended to a + * function that accepts a bignum for the first input, and only + * returns the first co-factor. */ + +mp_limb_t +mpn_gcdext_1 (mp_limb_signed_t *up, mp_limb_signed_t *vp, + mp_limb_t a, mp_limb_t b) +{ + /* Maintain + + a = u0 A + v0 B + b = u1 A + v1 B + + where A, B are the original inputs. + */ + mp_limb_signed_t u0 = 1; + mp_limb_signed_t v0 = 0; + mp_limb_signed_t u1 = 0; + mp_limb_signed_t v1 = 1; + + ASSERT (a > 0); + ASSERT (b > 0); + + if (a < b) + goto divide_by_b; + + for (;;) + { + mp_limb_t q; + + q = a / b; + a -= q * b; + + if (a == 0) + { + *up = u1; + *vp = v1; + return b; + } + u0 -= q * u1; + v0 -= q * v1; + + divide_by_b: + q = b / a; + b -= q * a; + + if (b == 0) + { + *up = u0; + *vp = v0; + return a; + } + u1 -= q * u0; + v1 -= q * v0; + } +} +#endif /* !GCDEXT_1_USE_BINARY */ diff --git a/gmp-6.3.0/mpn/generic/gcdext_lehmer.c b/gmp-6.3.0/mpn/generic/gcdext_lehmer.c new file mode 100644 index 0000000..ea4e86d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gcdext_lehmer.c @@ -0,0 +1,336 @@ +/* mpn_gcdext -- Extended Greatest Common Divisor. + +Copyright 1996, 1998, 2000-2005, 2008, 2009, 2012 Free Software Foundation, +Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Here, d is the index of the cofactor to update. FIXME: Could use qn + = 0 for the common case q = 1. */ +void +mpn_gcdext_hook (void *p, mp_srcptr gp, mp_size_t gn, + mp_srcptr qp, mp_size_t qn, int d) +{ + struct gcdext_ctx *ctx = (struct gcdext_ctx *) p; + mp_size_t un = ctx->un; + + if (gp) + { + mp_srcptr up; + + ASSERT (gn > 0); + ASSERT (gp[gn-1] > 0); + + MPN_COPY (ctx->gp, gp, gn); + ctx->gn = gn; + + if (d < 0) + { + int c; + + /* Must return the smallest cofactor, +u1 or -u0 */ + MPN_CMP (c, ctx->u0, ctx->u1, un); + ASSERT (c != 0 || (un == 1 && ctx->u0[0] == 1 && ctx->u1[0] == 1)); + + d = c < 0; + } + + up = d ? ctx->u0 : ctx->u1; + + MPN_NORMALIZE (up, un); + MPN_COPY (ctx->up, up, un); + + *ctx->usize = d ? -un : un; + } + else + { + mp_limb_t cy; + mp_ptr u0 = ctx->u0; + mp_ptr u1 = ctx->u1; + + ASSERT (d >= 0); + + if (d) + MP_PTR_SWAP (u0, u1); + + qn -= (qp[qn-1] == 0); + + /* Update u0 += q * u1 */ + if (qn == 1) + { + mp_limb_t q = qp[0]; + + if (q == 1) + /* A common case. */ + cy = mpn_add_n (u0, u0, u1, un); + else + cy = mpn_addmul_1 (u0, u1, un, q); + } + else + { + mp_size_t u1n; + mp_ptr tp; + + u1n = un; + MPN_NORMALIZE (u1, u1n); + + if (u1n == 0) + return; + + /* Should always have u1n == un here, and u1 >= u0. The + reason is that we alternate adding u0 to u1 and u1 to u0 + (corresponding to subtractions a - b and b - a), and we + can get a large quotient only just after a switch, which + means that we'll add (a multiple of) the larger u to the + smaller. */ + + tp = ctx->tp; + + if (qn > u1n) + mpn_mul (tp, qp, qn, u1, u1n); + else + mpn_mul (tp, u1, u1n, qp, qn); + + u1n += qn; + u1n -= tp[u1n-1] == 0; + + if (u1n >= un) + { + cy = mpn_add (u0, tp, u1n, u0, un); + un = u1n; + } + else + /* Note: Unlikely case, maybe never happens? */ + cy = mpn_add (u0, u0, un, tp, u1n); + + } + u0[un] = cy; + ctx->un = un + (cy > 0); + } +} + +/* Temporary storage: 3*(n+1) for u. If hgcd2 succeeds, we need n for + the matrix-vector multiplication adjusting a, b. If hgcd fails, we + need at most n for the quotient and n+1 for the u update (reusing + the extra u). In all, 4n + 3. 
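+   (For a concrete feel, n = 100 limbs gives a bound of 4*100 + 3 = 403
+   scratch limbs.  The itch macro callers use for this function,
+   MPN_GCDEXT_LEHMER_N_ITCH, is presumably just this 4n + 3 bound.)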
*/ + +mp_size_t +mpn_gcdext_lehmer_n (mp_ptr gp, mp_ptr up, mp_size_t *usize, + mp_ptr ap, mp_ptr bp, mp_size_t n, + mp_ptr tp) +{ + mp_size_t ualloc = n + 1; + + /* Keeps track of the second row of the reduction matrix + * + * M = (v0, v1 ; u0, u1) + * + * which correspond to the first column of the inverse + * + * M^{-1} = (u1, -v1; -u0, v0) + * + * This implies that + * + * a = u1 A (mod B) + * b = -u0 A (mod B) + * + * where A, B denotes the input values. + */ + + struct gcdext_ctx ctx; + mp_size_t un; + mp_ptr u0; + mp_ptr u1; + mp_ptr u2; + + MPN_ZERO (tp, 3*ualloc); + u0 = tp; tp += ualloc; + u1 = tp; tp += ualloc; + u2 = tp; tp += ualloc; + + u1[0] = 1; un = 1; + + ctx.gp = gp; + ctx.up = up; + ctx.usize = usize; + + /* FIXME: Handle n == 2 differently, after the loop? */ + while (n >= 2) + { + struct hgcd_matrix1 M; + mp_limb_t ah, al, bh, bl; + mp_limb_t mask; + + mask = ap[n-1] | bp[n-1]; + ASSERT (mask > 0); + + if (mask & GMP_NUMB_HIGHBIT) + { + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else if (n == 2) + { + /* We use the full inputs without truncation, so we can + safely shift left. */ + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[1], ap[0]); + al = ap[0] << shift; + bh = MPN_EXTRACT_NUMB (shift, bp[1], bp[0]); + bl = bp[0] << shift; + } + else + { + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]); + al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]); + bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]); + bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]); + } + + /* Try an mpn_nhgcd2 step */ + if (mpn_hgcd2 (ah, al, bh, bl, &M)) + { + n = mpn_matrix22_mul1_inverse_vector (&M, tp, ap, bp, n); + MP_PTR_SWAP (ap, tp); + un = mpn_hgcd_mul_matrix1_vector(&M, u2, u0, u1, un); + MP_PTR_SWAP (u0, u2); + } + else + { + /* mpn_hgcd2 has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. */ + ctx.u0 = u0; + ctx.u1 = u1; + ctx.tp = u2; + ctx.un = un; + + /* Temporary storage n for the quotient and ualloc for the + new cofactor. */ + n = mpn_gcd_subdiv_step (ap, bp, n, 0, mpn_gcdext_hook, &ctx, tp); + if (n == 0) + return ctx.gn; + + un = ctx.un; + } + } + ASSERT_ALWAYS (ap[0] > 0); + ASSERT_ALWAYS (bp[0] > 0); + + if (ap[0] == bp[0]) + { + int c; + + /* Which cofactor to return now? Candidates are +u1 and -u0, + depending on which of a and b was most recently reduced, + which we don't keep track of. So compare and get the smallest + one. */ + + gp[0] = ap[0]; + + MPN_CMP (c, u0, u1, un); + ASSERT (c != 0 || (un == 1 && u0[0] == 1 && u1[0] == 1)); + if (c < 0) + { + MPN_NORMALIZE (u0, un); + MPN_COPY (up, u0, un); + *usize = -un; + } + else + { + MPN_NORMALIZE_NOT_ZERO (u1, un); + MPN_COPY (up, u1, un); + *usize = un; + } + return 1; + } + else + { + mp_limb_t uh, vh; + mp_limb_signed_t u; + mp_limb_signed_t v; + int negate; + + gp[0] = mpn_gcdext_1 (&u, &v, ap[0], bp[0]); + + /* Set up = u u1 - v u0. Keep track of size, un grows by one or + two limbs. 
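+	 The growth by two limbs can only come from the carry handling
+	 below: mpn_mul_1 and mpn_addmul_1 each return a high limb (uh
+	 and vh), their sum is appended as one extra limb, and if that
+	 sum itself wraps around, a second limb equal to 1 is appended.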
*/ + + if (u == 0) + { + ASSERT (v == 1); + MPN_NORMALIZE (u0, un); + MPN_COPY (up, u0, un); + *usize = -un; + return 1; + } + else if (v == 0) + { + ASSERT (u == 1); + MPN_NORMALIZE (u1, un); + MPN_COPY (up, u1, un); + *usize = un; + return 1; + } + else if (u > 0) + { + negate = 0; + ASSERT (v < 0); + v = -v; + } + else + { + negate = 1; + ASSERT (v > 0); + u = -u; + } + + uh = mpn_mul_1 (up, u1, un, u); + vh = mpn_addmul_1 (up, u0, un, v); + + if ( (uh | vh) > 0) + { + uh += vh; + up[un++] = uh; + if (uh < vh) + up[un++] = 1; + } + + MPN_NORMALIZE_NOT_ZERO (up, un); + + *usize = negate ? -un : un; + return 1; + } +} diff --git a/gmp-6.3.0/mpn/generic/get_d.c b/gmp-6.3.0/mpn/generic/get_d.c new file mode 100644 index 0000000..8bef128 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/get_d.c @@ -0,0 +1,438 @@ +/* mpn_get_d -- limbs to double conversion. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2003, 2004, 2007, 2009, 2010, 2012, 2018 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "config.h" + +#if HAVE_FLOAT_H +#include /* for DBL_MANT_DIG and FLT_RADIX */ +#endif + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef _GMP_IEEE_FLOATS +#define _GMP_IEEE_FLOATS 0 +#endif + +/* To force use of the generic C code for testing, put + "#define _GMP_IEEE_FLOATS 0" at this point. */ + + +/* In alpha gcc prior to 3.4, signed DI comparisons involving constants are + rearranged from "x < n" to "x+(-n) < 0", which is of course hopelessly + wrong if that addition overflows. + + The workaround here avoids this bug by ensuring n is not a literal constant. + Note that this is alpha specific. The offending transformation is/was in + alpha.c alpha_emit_conditional_branch() under "We want to use cmpcc/bcc". + + Bizarrely, this happens also with Cray cc on alphaev5-cray-unicosmk2.0.6.X, + and has the same solution. Don't know why or how. */ + +#if HAVE_HOST_CPU_FAMILY_alpha \ + && ((defined (__GNUC__) && ! __GMP_GNUC_PREREQ(3,4)) \ + || defined (_CRAY)) +static volatile const long CONST_1024 = 1024; +static volatile const long CONST_NEG_1023 = -1023; +static volatile const long CONST_NEG_1022_SUB_53 = -1022 - 53; +#else +#define CONST_1024 (1024) +#define CONST_NEG_1023 (-1023) +#define CONST_NEG_1022_SUB_53 (-1022 - 53) +#endif + + +/* Return the value {ptr,size}*2^exp, and negative if sign<0. Must have + size>=1, and a non-zero high limb ptr[size-1]. + + When we know the fp format, the result is truncated towards zero. 
This is + consistent with other gmp conversions, like mpz_set_f or mpz_set_q, and is + easy to implement and test. + + When we do not know the format, such truncation seems much harder. One + would need to defeat any rounding mode, including round-up. + + It's felt that GMP is not primarily concerned with hardware floats, and + really isn't enhanced by getting involved with hardware rounding modes + (which could even be some weird unknown style), so something unambiguous and + straightforward is best. + + + The IEEE code below is the usual case, it knows either a 32-bit or 64-bit + limb and is done with shifts and masks. The 64-bit case in particular + should come out nice and compact. + + The generic code used to work one bit at a time, which was not only slow, + but implicitly relied upon denorms for intermediates, since the lowest bits' + weight of a perfectly valid fp number underflows in non-denorm. Therefore, + the generic code now works limb-per-limb, initially creating a number x such + that 1 <= x <= BASE. (BASE is reached only as result of rounding.) Then + x's exponent is scaled with explicit code (not ldexp to avoid libm + dependency). It is a tap-dance to avoid underflow or overflow, beware! + + + Traps: + + Hardware traps for overflow to infinity, underflow to zero, or unsupported + denorms may or may not be taken. The IEEE code works bitwise and so + probably won't trigger them, the generic code works by float operations and + so probably will. This difference might be thought less than ideal, but + again its felt straightforward code is better than trying to get intimate + with hardware exceptions (of perhaps unknown nature). + + + Not done: + + mpz_get_d in the past handled size==1 with a cast limb->double. This might + still be worthwhile there (for up to the mantissa many bits), but for + mpn_get_d here, the cost of applying "exp" to the resulting exponent would + probably use up any benefit a cast may have over bit twiddling. Also, if + the exponent is pushed into denorm range then bit twiddling is the only + option, to ensure the desired truncation is obtained. + + + Other: + + For reference, note that HPPA 8000, 8200, 8500 and 8600 trap FCNV,UDW,DBL + to the kernel for values >= 2^63. This makes it slow, and worse the kernel + Linux (what versions?) apparently uses untested code in its trap handling + routines, and gets the sign wrong. We don't use such a limb-to-double + cast, neither in the IEEE or generic code. */ + + + +#undef FORMAT_RECOGNIZED + +double +mpn_get_d (mp_srcptr up, mp_size_t size, mp_size_t sign, long exp) +{ + int lshift, nbits; + mp_limb_t x, mhi, mlo; + + ASSERT (size >= 0); + ASSERT_MPN (up, size); + ASSERT (size == 0 || up[size-1] != 0); + + if (size == 0) + return 0.0; + + /* Adjust exp to a radix point just above {up,size}, guarding against + overflow. After this exp can of course be reduced to anywhere within + the {up,size} region without underflow. 
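+     For example, with GMP_NUMB_BITS == 64, size == 3 and exp == -10 on
+     entry, exp becomes 3*64 - 10 = 182, i.e. the radix point is now
+     taken to sit immediately above the most significant limb up[2].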
*/ + if (UNLIKELY ((unsigned long) (GMP_NUMB_BITS * size) + > ((unsigned long) LONG_MAX - exp))) + { +#if _GMP_IEEE_FLOATS + goto ieee_infinity; +#endif + + /* generic */ + exp = LONG_MAX; + } + else + { + exp += GMP_NUMB_BITS * size; + } + +#if _GMP_IEEE_FLOATS + { + union ieee_double_extract u; + + up += size; + +#if GMP_LIMB_BITS == 64 + mlo = up[-1]; + count_leading_zeros (lshift, mlo); + + exp -= (lshift - GMP_NAIL_BITS) + 1; + mlo <<= lshift; + + nbits = GMP_LIMB_BITS - lshift; + + if (nbits < 53 && size > 1) + { + x = up[-2]; + x <<= GMP_NAIL_BITS; + x >>= nbits; + mlo |= x; + nbits += GMP_NUMB_BITS; + + if (LIMBS_PER_DOUBLE >= 3 && nbits < 53 && size > 2) + { + x = up[-3]; + x <<= GMP_NAIL_BITS; + x >>= nbits; + mlo |= x; + nbits += GMP_NUMB_BITS; + } + } + mhi = mlo >> (32 + 11); + mlo = mlo >> 11; /* later implicitly truncated to 32 bits */ +#endif +#if GMP_LIMB_BITS == 32 + x = *--up; + count_leading_zeros (lshift, x); + + exp -= (lshift - GMP_NAIL_BITS) + 1; + x <<= lshift; + mhi = x >> 11; + + if (lshift < 11) /* FIXME: never true if NUMB < 20 bits */ + { + /* All 20 bits in mhi */ + mlo = x << 21; + /* >= 1 bit in mlo */ + nbits = GMP_LIMB_BITS - lshift - 21; + } + else + { + if (size > 1) + { + nbits = GMP_LIMB_BITS - lshift; + + x = *--up, size--; + x <<= GMP_NAIL_BITS; + mhi |= x >> nbits >> 11; + + mlo = x << (GMP_LIMB_BITS - nbits - 11); + nbits = nbits + 11 - GMP_NAIL_BITS; + } + else + { + mlo = 0; + goto done; + } + } + + /* Now all needed bits in mhi have been accumulated. Add bits to mlo. */ + + if (LIMBS_PER_DOUBLE >= 2 && nbits < 32 && size > 1) + { + x = up[-1]; + x <<= GMP_NAIL_BITS; + x >>= nbits; + mlo |= x; + nbits += GMP_NUMB_BITS; + + if (LIMBS_PER_DOUBLE >= 3 && nbits < 32 && size > 2) + { + x = up[-2]; + x <<= GMP_NAIL_BITS; + x >>= nbits; + mlo |= x; + nbits += GMP_NUMB_BITS; + + if (LIMBS_PER_DOUBLE >= 4 && nbits < 32 && size > 3) + { + x = up[-3]; + x <<= GMP_NAIL_BITS; + x >>= nbits; + mlo |= x; + nbits += GMP_NUMB_BITS; + } + } + } + + done:; + +#endif + if (UNLIKELY (exp >= CONST_1024)) + { + /* overflow, return infinity */ + ieee_infinity: + mhi = 0; + mlo = 0; + exp = 1024; + } + else if (UNLIKELY (exp <= CONST_NEG_1023)) + { + int rshift; + + if (LIKELY (exp <= CONST_NEG_1022_SUB_53)) + return 0.0; /* denorm underflows to zero */ + + rshift = -1022 - exp; + ASSERT (rshift > 0 && rshift < 53); +#if GMP_LIMB_BITS > 53 + mlo >>= rshift; + mhi = mlo >> 32; +#else + if (rshift >= 32) + { + mlo = mhi; + mhi = 0; + rshift -= 32; + } + lshift = GMP_LIMB_BITS - rshift; + mlo = (mlo >> rshift) | (rshift == 0 ? 
0 : mhi << lshift); + mhi >>= rshift; +#endif + exp = -1023; + } + u.s.manh = mhi; + u.s.manl = mlo; + u.s.exp = exp + 1023; + u.s.sig = (sign < 0); + return u.d; + } +#define FORMAT_RECOGNIZED 1 +#endif + +#if HAVE_DOUBLE_VAX_D + { + union double_extract u; + + up += size; + + mhi = up[-1]; + + count_leading_zeros (lshift, mhi); + exp -= lshift; + mhi <<= lshift; + + mlo = 0; + if (size > 1) + { + mlo = up[-2]; + if (lshift != 0) + mhi += mlo >> (GMP_LIMB_BITS - lshift); + mlo <<= lshift; + + if (size > 2 && lshift > 8) + { + x = up[-3]; + mlo += x >> (GMP_LIMB_BITS - lshift); + } + } + + if (UNLIKELY (exp >= 128)) + { + /* overflow, return maximum number */ + mhi = 0xffffffff; + mlo = 0xffffffff; + exp = 127; + } + else if (UNLIKELY (exp < -128)) + { + return 0.0; /* underflows to zero */ + } + + u.s.man3 = mhi >> 24; /* drop msb, since implicit */ + u.s.man2 = mhi >> 8; + u.s.man1 = (mhi << 8) + (mlo >> 24); + u.s.man0 = mlo >> 8; + u.s.exp = exp + 128; + u.s.sig = sign < 0; + return u.d; + } +#define FORMAT_RECOGNIZED 1 +#endif + +#if ! FORMAT_RECOGNIZED + +#if !defined(GMP_DBL_MANT_BITS) +#if defined(DBL_MANT_DIG) && FLT_RADIX == 2 +#define GMP_DBL_MANT_BITS DBL_MANT_DIG +#else +/* FIXME: Chose a smarter default value. */ +#define GMP_DBL_MANT_BITS (16 * sizeof (double)) +#endif +#endif + + { /* Non-IEEE or strange limb size, generically convert + GMP_DBL_MANT_BITS bits. */ + mp_limb_t l; + int m; + mp_size_t i; + double d, weight; + unsigned long uexp; + + /* First generate an fp number disregarding exp, instead keeping things + within the numb base factor from 1, which should prevent overflow and + underflow even for the most exponent limited fp formats. */ + i = size - 1; + l = up[i]; + count_leading_zeros (m, l); + m = m + GMP_DBL_MANT_BITS - GMP_LIMB_BITS; + if (m < 0) + l &= GMP_NUMB_MAX << -m; + d = l; + for (weight = 1/MP_BASE_AS_DOUBLE; m > 0 && --i >= 0;) + { + l = up[i]; + m -= GMP_NUMB_BITS; + if (m < 0) + l &= GMP_NUMB_MAX << -m; + d += l * weight; + weight /= MP_BASE_AS_DOUBLE; + if (weight == 0) + break; + } + + /* Now apply exp. */ + exp -= GMP_NUMB_BITS; + if (exp > 0) + { + weight = 2.0; + uexp = exp; + } + else + { + weight = 0.5; + uexp = NEG_CAST (unsigned long, exp); + } +#if 1 + /* Square-and-multiply exponentiation. */ + if (uexp & 1) + d *= weight; + while (uexp >>= 1) + { + weight *= weight; + if (uexp & 1) + d *= weight; + } +#else + /* Plain exponentiation. */ + while (uexp > 0) + { + d *= weight; + uexp--; + } +#endif + + return sign >= 0 ? d : -d; + } +#endif +} diff --git a/gmp-6.3.0/mpn/generic/get_str.c b/gmp-6.3.0/mpn/generic/get_str.c new file mode 100644 index 0000000..19cc581 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/get_str.c @@ -0,0 +1,451 @@ +/* mpn_get_str -- Convert {UP,USIZE} to a base BASE string in STR. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE, EXCEPT mpn_get_str, ARE INTERNAL WITH MUTABLE + INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. + IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A + FUTURE GNU MP RELEASE. + +Copyright 1991-2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Conversion of U {up,un} to a string in base b. Internally, we convert to + base B = b^m, the largest power of b that fits a limb. Basic algorithms: + + A) Divide U repeatedly by B, generating a quotient and remainder, until the + quotient becomes zero. The remainders hold the converted digits. Digits + come out from right to left. (Used in mpn_bc_get_str.) + + B) Divide U by b^g, for g such that 1/b <= U/b^g < 1, generating a fraction. + Then develop digits by multiplying the fraction repeatedly by b. Digits + come out from left to right. (Currently not used herein, except for in + code for converting single limbs to individual digits.) + + C) Compute B^1, B^2, B^4, ..., B^s, for s such that B^s is just above + sqrt(U). Then divide U by B^s, generating quotient and remainder. + Recursively convert the quotient, then the remainder, using the + precomputed powers. Digits come out from left to right. (Used in + mpn_dc_get_str.) + + When using algorithm C, algorithm B might be suitable for basecase code, + since the required b^g power will be readily accessible. + + Optimization ideas: + 1. The recursive function of (C) could use less temporary memory. The powtab + allocation could be trimmed with some computation, and the tmp area could + be reduced, or perhaps eliminated if up is reused for both quotient and + remainder (it is currently used just for remainder). + 2. Store the powers of (C) in normalized form, with the normalization count. + Quotients will usually need to be left-shifted before each divide, and + remainders will either need to be left-shifted of right-shifted. + 3. In the code for developing digits from a single limb, we could avoid using + a full umul_ppmm except for the first (or first few) digits, provided base + is even. Subsequent digits can be developed using plain multiplication. + (This saves on register-starved machines (read x86) and on all machines + that generate the upper product half using a separate instruction (alpha, + powerpc, IA-64) or lacks such support altogether (sparc64, hppa64). + 4. Separate mpn_dc_get_str basecase code from code for small conversions. The + former code will have the exact right power readily available in the + powtab parameter for dividing the current number into a fraction. Convert + that using algorithm B. + 5. Completely avoid division. Compute the inverses of the powers now in + powtab instead of the actual powers. + 6. Decrease powtab allocation for even bases. E.g. for base 10 we could save + about 30% (1-log(5)/log(10)). + + Basic structure of (C): + mpn_get_str: + if POW2_P (n) + ... 
+ else + if (un < GET_STR_PRECOMPUTE_THRESHOLD) + mpn_bx_get_str (str, base, up, un); + else + precompute_power_tables + mpn_dc_get_str + + mpn_dc_get_str: + mpn_tdiv_qr + if (qn < GET_STR_DC_THRESHOLD) + mpn_bc_get_str + else + mpn_dc_get_str + if (rn < GET_STR_DC_THRESHOLD) + mpn_bc_get_str + else + mpn_dc_get_str + + + The reason for the two threshold values is the cost of + precompute_power_tables. GET_STR_PRECOMPUTE_THRESHOLD will be + considerably larger than GET_STR_DC_THRESHOLD. */ + + +/* The x86s and m68020 have a quotient and remainder "div" instruction and + gcc recognises an adjacent "/" and "%" can be combined using that. + Elsewhere "/" and "%" are either separate instructions, or separate + libgcc calls (which unfortunately gcc as of version 3.0 doesn't combine). + A multiply and subtract should be faster than a "%" in those cases. */ +#if HAVE_HOST_CPU_FAMILY_x86 \ + || HAVE_HOST_CPU_m68020 \ + || HAVE_HOST_CPU_m68030 \ + || HAVE_HOST_CPU_m68040 \ + || HAVE_HOST_CPU_m68060 \ + || HAVE_HOST_CPU_m68360 /* CPU32 */ +#define udiv_qrnd_unnorm(q,r,n,d) \ + do { \ + mp_limb_t __q = (n) / (d); \ + mp_limb_t __r = (n) % (d); \ + (q) = __q; \ + (r) = __r; \ + } while (0) +#else +#define udiv_qrnd_unnorm(q,r,n,d) \ + do { \ + mp_limb_t __q = (n) / (d); \ + mp_limb_t __r = (n) - __q*(d); \ + (q) = __q; \ + (r) = __r; \ + } while (0) +#endif + + +/* Convert {up,un} to a string in base base, and put the result in str. + Generate len characters, possibly padding with zeros to the left. If len is + zero, generate as many characters as required. Return a pointer immediately + after the last digit of the result string. Complexity is O(un^2); intended + for small conversions. */ +static unsigned char * +mpn_bc_get_str (unsigned char *str, size_t len, + mp_ptr up, mp_size_t un, int base) +{ + mp_limb_t rl, ul; + unsigned char *s; + size_t l; + /* Allocate memory for largest possible string, given that we only get here + for operands with un < GET_STR_PRECOMPUTE_THRESHOLD and that the smallest + base is 3. 7/11 is an approximation to 1/log2(3). */ +#if TUNE_PROGRAM_BUILD +#define BUF_ALLOC (GET_STR_THRESHOLD_LIMIT * GMP_LIMB_BITS * 7 / 11) +#else +#define BUF_ALLOC (GET_STR_PRECOMPUTE_THRESHOLD * GMP_LIMB_BITS * 7 / 11) +#endif + unsigned char buf[BUF_ALLOC]; +#if TUNE_PROGRAM_BUILD + mp_limb_t rp[GET_STR_THRESHOLD_LIMIT]; +#else + mp_limb_t rp[GET_STR_PRECOMPUTE_THRESHOLD]; +#endif + + if (base == 10) + { + /* Special case code for base==10 so that the compiler has a chance to + optimize things. */ + + MPN_COPY (rp + 1, up, un); + + s = buf + BUF_ALLOC; + while (un > 1) + { + int i; + mp_limb_t frac, digit; + MPN_DIVREM_OR_PREINV_DIVREM_1 (rp, (mp_size_t) 1, rp + 1, un, + MP_BASES_BIG_BASE_10, + MP_BASES_BIG_BASE_INVERTED_10, + MP_BASES_NORMALIZATION_STEPS_10); + un -= rp[un] == 0; + frac = (rp[0] + 1) << GMP_NAIL_BITS; + s -= MP_BASES_CHARS_PER_LIMB_10; +#if HAVE_HOST_CPU_FAMILY_x86 + /* The code below turns out to be a bit slower for x86 using gcc. + Use plain code. */ + i = MP_BASES_CHARS_PER_LIMB_10; + do + { + umul_ppmm (digit, frac, frac, 10); + *s++ = digit; + } + while (--i); +#else + /* Use the fact that 10 in binary is 1010, with the lowest bit 0. + After a few umul_ppmm, we will have accumulated enough low zeros + to use a plain multiply. 
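+	     Roughly speaking, this inner loop is algorithm (B) from the
+	     top of the file in miniature: frac holds the remainder
+	     scaled up to a one-limb binary fraction, each
+	     umul_ppmm (digit, frac, frac, 10) multiplies that fraction
+	     by ten, the integer part spilling into the high word is the
+	     next decimal digit, and the new fractional part stays in
+	     frac.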
*/ + if (MP_BASES_NORMALIZATION_STEPS_10 == 0) + { + umul_ppmm (digit, frac, frac, 10); + *s++ = digit; + } + if (MP_BASES_NORMALIZATION_STEPS_10 <= 1) + { + umul_ppmm (digit, frac, frac, 10); + *s++ = digit; + } + if (MP_BASES_NORMALIZATION_STEPS_10 <= 2) + { + umul_ppmm (digit, frac, frac, 10); + *s++ = digit; + } + if (MP_BASES_NORMALIZATION_STEPS_10 <= 3) + { + umul_ppmm (digit, frac, frac, 10); + *s++ = digit; + } + i = (MP_BASES_CHARS_PER_LIMB_10 - ((MP_BASES_NORMALIZATION_STEPS_10 < 4) + ? (4-MP_BASES_NORMALIZATION_STEPS_10) + : 0)); + frac = (frac + 0xf) >> 4; + do + { + frac *= 10; + digit = frac >> (GMP_LIMB_BITS - 4); + *s++ = digit; + frac &= (~(mp_limb_t) 0) >> 4; + } + while (--i); +#endif + s -= MP_BASES_CHARS_PER_LIMB_10; + } + + ul = rp[1]; + while (ul != 0) + { + udiv_qrnd_unnorm (ul, rl, ul, 10); + *--s = rl; + } + } + else /* not base 10 */ + { + unsigned chars_per_limb; + mp_limb_t big_base, big_base_inverted; + unsigned normalization_steps; + + chars_per_limb = mp_bases[base].chars_per_limb; + big_base = mp_bases[base].big_base; + big_base_inverted = mp_bases[base].big_base_inverted; + count_leading_zeros (normalization_steps, big_base); + + MPN_COPY (rp + 1, up, un); + + s = buf + BUF_ALLOC; + while (un > 1) + { + int i; + mp_limb_t frac; + MPN_DIVREM_OR_PREINV_DIVREM_1 (rp, (mp_size_t) 1, rp + 1, un, + big_base, big_base_inverted, + normalization_steps); + un -= rp[un] == 0; + frac = (rp[0] + 1) << GMP_NAIL_BITS; + s -= chars_per_limb; + i = chars_per_limb; + do + { + mp_limb_t digit; + umul_ppmm (digit, frac, frac, base); + *s++ = digit; + } + while (--i); + s -= chars_per_limb; + } + + ul = rp[1]; + while (ul != 0) + { + udiv_qrnd_unnorm (ul, rl, ul, base); + *--s = rl; + } + } + + l = buf + BUF_ALLOC - s; + while (l < len) + { + *str++ = 0; + len--; + } + while (l != 0) + { + *str++ = *s++; + l--; + } + return str; +} + + +/* Convert {UP,UN} to a string with a base as represented in POWTAB, and put + the string in STR. Generate LEN characters, possibly padding with zeros to + the left. If LEN is zero, generate as many characters as required. + Return a pointer immediately after the last digit of the result string. + This uses divide-and-conquer and is intended for large conversions. */ +static unsigned char * +mpn_dc_get_str (unsigned char *str, size_t len, + mp_ptr up, mp_size_t un, + const powers_t *powtab, mp_ptr tmp) +{ + if (BELOW_THRESHOLD (un, GET_STR_DC_THRESHOLD)) + { + if (un != 0) + str = mpn_bc_get_str (str, len, up, un, powtab->base); + else + { + while (len != 0) + { + *str++ = 0; + len--; + } + } + } + else + { + mp_ptr pwp, qp, rp; + mp_size_t pwn, qn; + mp_size_t sn; + + pwp = powtab->p; + pwn = powtab->n; + sn = powtab->shift; + + if (un < pwn + sn || (un == pwn + sn && mpn_cmp (up + sn, pwp, un - sn) < 0)) + { + str = mpn_dc_get_str (str, len, up, un, powtab - 1, tmp); + } + else + { + qp = tmp; /* (un - pwn + 1) limbs for qp */ + rp = up; /* pwn limbs for rp; overwrite up area */ + + mpn_tdiv_qr (qp, rp + sn, 0L, up + sn, un - sn, pwp, pwn); + qn = un - sn - pwn; qn += qp[qn] != 0; /* quotient size */ + + ASSERT (qn < pwn + sn || (qn == pwn + sn && mpn_cmp (qp + sn, pwp, pwn) < 0)); + + if (len != 0) + len = len - powtab->digits_in_base; + + str = mpn_dc_get_str (str, len, qp, qn, powtab - 1, tmp + qn); + str = mpn_dc_get_str (str, powtab->digits_in_base, rp, pwn + sn, powtab - 1, tmp); + } + } + return str; +} + +/* There are no leading zeros on the digits generated at str, but that's not + currently a documented feature. 
The current mpz_out_str and mpz_get_str + rely on it. */ + +size_t +mpn_get_str (unsigned char *str, int base, mp_ptr up, mp_size_t un) +{ + mp_ptr powtab_mem; + powers_t powtab[GMP_LIMB_BITS]; + int pi; + size_t out_len; + mp_ptr tmp; + TMP_DECL; + + /* Special case zero, as the code below doesn't handle it. */ + if (un == 0) + { + str[0] = 0; + return 1; + } + + if (POW2_P (base)) + { + /* The base is a power of 2. Convert from most significant end. */ + mp_limb_t n1, n0; + int bits_per_digit = mp_bases[base].big_base; + int cnt; + int bit_pos; + mp_size_t i; + unsigned char *s = str; + mp_bitcnt_t bits; + + n1 = up[un - 1]; + count_leading_zeros (cnt, n1); + + /* BIT_POS should be R when input ends in least significant nibble, + R + bits_per_digit * n when input ends in nth least significant + nibble. */ + + bits = (mp_bitcnt_t) GMP_NUMB_BITS * un - cnt + GMP_NAIL_BITS; + cnt = bits % bits_per_digit; + if (cnt != 0) + bits += bits_per_digit - cnt; + bit_pos = bits - (mp_bitcnt_t) (un - 1) * GMP_NUMB_BITS; + + /* Fast loop for bit output. */ + i = un - 1; + for (;;) + { + bit_pos -= bits_per_digit; + while (bit_pos >= 0) + { + *s++ = (n1 >> bit_pos) & ((1 << bits_per_digit) - 1); + bit_pos -= bits_per_digit; + } + i--; + if (i < 0) + break; + n0 = (n1 << -bit_pos) & ((1 << bits_per_digit) - 1); + n1 = up[i]; + bit_pos += GMP_NUMB_BITS; + *s++ = n0 | (n1 >> bit_pos); + } + + return s - str; + } + + /* General case. The base is not a power of 2. */ + + if (BELOW_THRESHOLD (un, GET_STR_PRECOMPUTE_THRESHOLD)) + return mpn_bc_get_str (str, (size_t) 0, up, un, base) - str; + + TMP_MARK; + + /* Allocate one large block for the powers of big_base. */ + powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un)); + + /* Compute a table of powers, were the largest power is >= sqrt(U). */ + size_t ndig; + mp_size_t xn; + DIGITS_IN_BASE_PER_LIMB (ndig, un, base); + xn = 1 + ndig / mp_bases[base].chars_per_limb; /* FIXME: scalar integer division */ + + pi = 1 + mpn_compute_powtab (powtab, powtab_mem, xn, base); + + /* Using our precomputed powers, now in powtab[], convert our number. */ + tmp = TMP_BALLOC_LIMBS (mpn_dc_get_str_itch (un)); + out_len = mpn_dc_get_str (str, 0, up, un, powtab + (pi - 1), tmp) - str; + TMP_FREE; + + return out_len; +} diff --git a/gmp-6.3.0/mpn/generic/gmp-mparam.h b/gmp-6.3.0/mpn/generic/gmp-mparam.h new file mode 100644 index 0000000..7dc057a --- /dev/null +++ b/gmp-6.3.0/mpn/generic/gmp-mparam.h @@ -0,0 +1,33 @@ +/* Generic C gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright 1991, 1993, 1994, 2000 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. 
If not, +see https://www.gnu.org/licenses/. */ + + +/* Values for GMP_LIMB_BITS etc will be determined by ./configure and put + in config.h. */ diff --git a/gmp-6.3.0/mpn/generic/hgcd.c b/gmp-6.3.0/mpn/generic/hgcd.c new file mode 100644 index 0000000..e3e9c66 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd.c @@ -0,0 +1,182 @@ +/* hgcd.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003-2005, 2008, 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Size analysis for hgcd: + + For the recursive calls, we have n1 <= ceil(n / 2). Then the + storage need is determined by the storage for the recursive call + computing M1, and hgcd_matrix_adjust and hgcd_matrix_mul calls that use M1 + (after this, the storage needed for M1 can be recycled). + + Let S(r) denote the required storage. For M1 we need 4 * (ceil(n1/2) + 1) + = 4 * (ceil(n/4) + 1), for the hgcd_matrix_adjust call, we need n + 2, + and for the hgcd_matrix_mul, we may need 3 ceil(n/2) + 8. In total, + 4 * ceil(n/4) + 3 ceil(n/2) + 12 <= 10 ceil(n/4) + 12. + + For the recursive call, we need S(n1) = S(ceil(n/2)). + + S(n) <= 10*ceil(n/4) + 12 + S(ceil(n/2)) + <= 10*(ceil(n/4) + ... + ceil(n/2^(1+k))) + 12k + S(ceil(n/2^k)) + <= 10*(2 ceil(n/4) + k) + 12k + S(ceil(n/2^k)) + <= 20 ceil(n/4) + 22k + S(ceil(n/2^k)) +*/ + +mp_size_t +mpn_hgcd_itch (mp_size_t n) +{ + unsigned k; + int count; + mp_size_t nscaled; + + if (BELOW_THRESHOLD (n, HGCD_THRESHOLD)) + return n; + + /* Get the recursion depth. */ + nscaled = (n - 1) / (HGCD_THRESHOLD - 1); + count_leading_zeros (count, nscaled); + k = GMP_LIMB_BITS - count; + + return 20 * ((n+3) / 4) + 22 * k + HGCD_THRESHOLD; +} + +/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M + with elements of size at most (n+1)/2 - 1. Returns new size of a, + b, or zero if no reduction is possible. */ + +mp_size_t +mpn_hgcd (mp_ptr ap, mp_ptr bp, mp_size_t n, + struct hgcd_matrix *M, mp_ptr tp) +{ + mp_size_t s = n/2 + 1; + + mp_size_t nn; + int success = 0; + + if (n <= s) + /* Happens when n <= 2, a fairly uninteresting case but exercised + by the random inputs of the testsuite. 
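+       Returning 0 here means no reduction at all was performed, so the
+       matrix M is left exactly as the caller initialized it (normally
+       the identity set up by mpn_hgcd_matrix_init).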
*/ + return 0; + + ASSERT ((ap[n-1] | bp[n-1]) > 0); + + ASSERT ((n+1)/2 - 1 < M->alloc); + + if (ABOVE_THRESHOLD (n, HGCD_THRESHOLD)) + { + mp_size_t n2 = (3*n)/4 + 1; + mp_size_t p = n/2; + + nn = mpn_hgcd_reduce (M, ap, bp, n, p, tp); + if (nn) + { + n = nn; + success = 1; + } + + /* NOTE: It appears this loop never runs more than once (at + least when not recursing to hgcd_appr). */ + while (n > n2) + { + /* Needs n + 1 storage */ + nn = mpn_hgcd_step (n, ap, bp, s, M, tp); + if (!nn) + return success ? n : 0; + + n = nn; + success = 1; + } + + if (n > s + 2) + { + struct hgcd_matrix M1; + mp_size_t scratch; + + p = 2*s - n + 1; + scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p); + + mpn_hgcd_matrix_init(&M1, n - p, tp); + + /* FIXME: Should use hgcd_reduce, but that may require more + scratch space, which requires review. */ + + nn = mpn_hgcd (ap + p, bp + p, n - p, &M1, tp + scratch); + if (nn > 0) + { + /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */ + ASSERT (M->n + 2 >= M1.n); + + /* Furthermore, assume M ends with a quotient (1, q; 0, 1), + then either q or q + 1 is a correct quotient, and M1 will + start with either (1, 0; 1, 1) or (2, 1; 1, 1). This + rules out the case that the size of M * M1 is much + smaller than the expected M->n + M1->n. */ + + ASSERT (M->n + M1.n < M->alloc); + + /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1) + = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */ + n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch); + + /* We need a bound for of M->n + M1.n. Let n be the original + input size. Then + + ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2 + + and it follows that + + M.n + M1.n <= ceil(n/2) + 1 + + Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the + amount of needed scratch space. */ + mpn_hgcd_matrix_mul (M, &M1, tp + scratch); + success = 1; + } + } + } + + for (;;) + { + /* Needs s+3 < n */ + nn = mpn_hgcd_step (n, ap, bp, s, M, tp); + if (!nn) + return success ? n : 0; + + n = nn; + success = 1; + } +} diff --git a/gmp-6.3.0/mpn/generic/hgcd2-div.h b/gmp-6.3.0/mpn/generic/hgcd2-div.h new file mode 100644 index 0000000..45ba453 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd2-div.h @@ -0,0 +1,504 @@ +/* hgcd2-div.h + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019, 2020 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef HGCD2_DIV1_METHOD +#define HGCD2_DIV1_METHOD 3 +#endif + +#ifndef HGCD2_DIV2_METHOD +#define HGCD2_DIV2_METHOD 2 +#endif + +#if HAVE_NATIVE_mpn_div_11 + +#define div1 mpn_div_11 +/* Single-limb division optimized for small quotients. + Returned value holds d0 = r, d1 = q. */ +mp_double_limb_t div1 (mp_limb_t, mp_limb_t); + +#elif HGCD2_DIV1_METHOD == 1 + +static inline mp_double_limb_t +div1 (mp_limb_t n0, mp_limb_t d0) +{ + mp_double_limb_t res; + res.d1 = n0 / d0; + res.d0 = n0 - res.d1 * d0; + + return res; +} + +#elif HGCD2_DIV1_METHOD == 2 + +static mp_double_limb_t +div1 (mp_limb_t n0, mp_limb_t d0) +{ + mp_double_limb_t res; + int ncnt, dcnt, cnt; + mp_limb_t q; + mp_limb_t mask; + + ASSERT (n0 >= d0); + + count_leading_zeros (ncnt, n0); + count_leading_zeros (dcnt, d0); + cnt = dcnt - ncnt; + + d0 <<= cnt; + + q = -(mp_limb_t) (n0 >= d0); + n0 -= d0 & q; + d0 >>= 1; + q = -q; + + while (--cnt >= 0) + { + mask = -(mp_limb_t) (n0 >= d0); + n0 -= d0 & mask; + d0 >>= 1; + q = (q << 1) - mask; + } + + res.d0 = n0; + res.d1 = q; + return res; +} + +#elif HGCD2_DIV1_METHOD == 3 + +static inline mp_double_limb_t +div1 (mp_limb_t n0, mp_limb_t d0) +{ + mp_double_limb_t res; + if (UNLIKELY ((d0 >> (GMP_LIMB_BITS - 3)) != 0) + || UNLIKELY (n0 >= (d0 << 3))) + { + res.d1 = n0 / d0; + res.d0 = n0 - res.d1 * d0; + } + else + { + mp_limb_t q, mask; + + d0 <<= 2; + + mask = -(mp_limb_t) (n0 >= d0); + n0 -= d0 & mask; + q = 4 & mask; + + d0 >>= 1; + mask = -(mp_limb_t) (n0 >= d0); + n0 -= d0 & mask; + q += 2 & mask; + + d0 >>= 1; + mask = -(mp_limb_t) (n0 >= d0); + n0 -= d0 & mask; + q -= mask; + + res.d0 = n0; + res.d1 = q; + } + return res; +} + +#elif HGCD2_DIV1_METHOD == 4 + +/* Table quotients. We extract the NBITS most significant bits of the + numerator limb, and the corresponding bits from the divisor limb, and use + these to form an index into the table. This method is probably only useful + for short pipelines with slow multiplication. + + Possible improvements: + + * Perhaps extract the highest NBITS of the divisor instead of the same bits + as from the numerator. That would require another count_leading_zeros, + and a post-multiply shift of the quotient. + + * Compress tables? Their values are tiny, and there are lots of zero + entries (which are never used). + + * Round the table entries more cleverly? +*/ + +#ifndef NBITS +#define NBITS 5 +#endif + +#if NBITS == 5 +/* This needs full division about 13.2% of the time. 
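The method-3 variant above avoids a hardware divide whenever the quotient is known to be below 8, repeatedly using the mask = -(n0 >= d0) idiom to subtract the shifted divisor without branching. The following free-standing sketch shows that idiom on uint64_t values with a plain-division cross-check; the function name and preconditions are illustrative, not GMP's.

/* Illustrative sketch of the branch-free quotient extraction used when the
   quotient fits in 3 bits (q < 8).  Standard C only.  */
#include <assert.h>
#include <stdint.h>

static void
div1_small_q (uint64_t n0, uint64_t d0, uint64_t *qp, uint64_t *rp)
{
  uint64_t q = 0, mask;

  /* Fast-path preconditions: q < 8 and d0 << 3 cannot overflow.  */
  assert (d0 > 0 && (d0 >> 61) == 0 && n0 < (d0 << 3));

  d0 <<= 2;                          /* quotient bit worth 4 */
  mask = -(uint64_t) (n0 >= d0);     /* all-ones if n0 >= d0, else zero */
  n0 -= d0 & mask;
  q += 4 & mask;

  d0 >>= 1;                          /* quotient bit worth 2 */
  mask = -(uint64_t) (n0 >= d0);
  n0 -= d0 & mask;
  q += 2 & mask;

  d0 >>= 1;                          /* quotient bit worth 1 */
  mask = -(uint64_t) (n0 >= d0);
  n0 -= d0 & mask;
  q += 1 & mask;

  *qp = q;
  *rp = n0;
}

int
main (void)
{
  uint64_t q, r;
  div1_small_q (29, 4, &q, &r);      /* 29 = 7*4 + 1 */
  assert (q == 29 / 4 && r == 29 % 4);
  return 0;
}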
*/ +static const unsigned char tab[512] = { +17, 9, 5,4,3,2,2,2,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +18, 9, 6,4,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +19,10, 6,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0, +20,10, 6,5,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0, +21,11, 7,5,4,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0, +22,11, 7,5,4,3,3,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0, +23,12, 7,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0, +24,12, 8,6,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, +25,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0, +26,13, 8,6,5,4,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, +27,14, 9,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, +28,14, 9,7,5,4,3,3,3,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0, +29,15,10,7,5,4,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0, +30,15,10,7,6,5,4,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0, +31,16,10,7,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0, +32,16,11,8,6,5,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +}; +#elif NBITS == 6 +/* This needs full division about 9.8% of the time. */ +static const unsigned char tab[2048] = { +33,17,11, 8, 6, 5,4,4,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 0, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +34,17,11, 8, 6, 5,4,4,3,3,3,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 0, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +35,18,12, 9, 7, 5,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +36,18,12, 9, 7, 6,5,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +37,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +38,19,13, 9, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +39,20,13,10, 7, 6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +40,20,14,10, 8, 6,5,5,4,3,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +41,21,14,10, 8, 6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +42,21,14,10, 8, 7,6,5,4,4,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +43,22,15,11, 8, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +44,22,15,11, 9, 7,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +45,23,15,11, 9, 7,6,5,5,4,4,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +46,23,16,11, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +47,24,16,12, 9, 7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +48,24,16,12, 9, 8,6,6,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 
1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +49,25,17,12,10, 8,7,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +50,25,17,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, +51,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0, +52,26,18,13,10, 8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0, +53,27,18,13,10, 9,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0, +54,27,19,14,11, 9,7,6,6,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0, +55,28,19,14,11, 9,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0, +56,28,19,14,11, 9,8,7,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0, +57,29,20,14,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0, +58,29,20,15,11, 9,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,1,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0, +59,30,20,15,12,10,8,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0, +60,30,21,15,12,10,8,7,6,6,5,5,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0, +61,31,21,15,12,10,8,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0, +62,31,22,16,12,10,9,7,6,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0, +63,32,22,16,13,10,9,7,7,6,5,5,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0, +64,32,22,16,13,10,9,8,7,6,5,5,4,4,4,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,1, + 1, 1, 1, 1, 1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +}; +#else +#error No table for provided NBITS +#endif + +/* Doing tabp with a #define makes compiler warnings about pointing outside an + object go away. We used to define this as a variable. It is not clear if + e.g. (vector[100] - 10) + 10 is well- defined as per the C standard; + (vector[100] + 10) - 10 surely is and there is no sequence point so the + expressions should be equivalent. To make this safe, we might want to + define tabp as a macro with the index as an argument. Depending on the + platform, relocs might allow for assembly-time or linker-time resolution to + take place. */ +#define tabp (tab - (1 << (NBITS - 1) << NBITS)) + +static inline mp_double_limb_t +div1 (mp_limb_t n0, mp_limb_t d0) +{ + int ncnt; + size_t nbi, dbi; + mp_limb_t q0; + mp_limb_t r0; + mp_limb_t mask; + mp_double_limb_t res; + + ASSERT (n0 >= d0); /* Actually only msb position is critical. 
*/ + + count_leading_zeros (ncnt, n0); + nbi = n0 << ncnt >> (GMP_LIMB_BITS - NBITS); + dbi = d0 << ncnt >> (GMP_LIMB_BITS - NBITS); + + q0 = tabp[(nbi << NBITS) + dbi]; + r0 = n0 - q0 * d0; + mask = -(mp_limb_t) (r0 >= d0); + q0 -= mask; + r0 -= d0 & mask; + + if (UNLIKELY (r0 >= d0)) + { + q0 = n0 / d0; + r0 = n0 - q0 * d0; + } + + res.d1 = q0; + res.d0 = r0; + return res; +} + +#elif HGCD2_DIV1_METHOD == 5 + +/* Table inverses of divisors. We don't bother with suppressing the msb from + the tables. We index with the NBITS most significant divisor bits, + including the always-set highest bit, but use addressing trickery via tabp + to suppress it. + + Possible improvements: + + * Do first multiply using 32-bit operations on 64-bit computers. At least + on most Arm64 cores, that uses 3 times less resources. It also saves on + many x86-64 processors. +*/ + +#ifndef NBITS +#define NBITS 7 +#endif + +#if NBITS == 5 +/* This needs full division about 1.63% of the time. */ +static const unsigned char tab[16] = { + 63, 59, 55, 52, 50, 47, 45, 43, 41, 39, 38, 36, 35, 34, 33, 32 +}; +#elif NBITS == 6 +/* This needs full division about 0.93% of the time. */ +static const unsigned char tab[32] = { +127,123,119,116,112,109,106,104,101, 98, 96, 94, 92, 90, 88, 86, + 84, 82, 80, 79, 77, 76, 74, 73, 72, 70, 69, 68, 67, 66, 65, 64 +}; +#elif NBITS == 7 +/* This needs full division about 0.49% of the time. */ +static const unsigned char tab[64] = { +255,251,247,243,239,236,233,229,226,223,220,217,214,211,209,206, +203,201,198,196,194,191,189,187,185,183,181,179,177,175,173,171, +169,167,166,164,162,161,159,158,156,155,153,152,150,149,147,146, +145,143,142,141,140,139,137,136,135,134,133,132,131,130,129,128 +}; +#elif NBITS == 8 +/* This needs full division about 0.26% of the time. */ +static const unsigned short tab[128] = { +511,507,503,499,495,491,488,484,480,477,473,470,467,463,460,457, +454,450,447,444,441,438,435,433,430,427,424,421,419,416,413,411, +408,406,403,401,398,396,393,391,389,386,384,382,380,377,375,373, +371,369,367,365,363,361,359,357,355,353,351,349,347,345,343,342, +340,338,336,335,333,331,329,328,326,325,323,321,320,318,317,315, +314,312,311,309,308,306,305,303,302,301,299,298,296,295,294,292, +291,290,288,287,286,285,283,282,281,280,279,277,276,275,274,273, +272,270,269,268,267,266,265,264,263,262,261,260,259,258,257,256 +}; +#else +#error No table for provided NBITS +#endif + +/* Doing tabp with a #define makes compiler warnings about pointing outside an + object go away. We used to define this as a variable. It is not clear if + e.g. (vector[100] - 10) + 10 is well- defined as per the C standard; + (vector[100] + 10) - 10 surely is and there is no sequence point so the + expressions should be equivalent. To make this safe, we might want to + define tabp as a macro with the index as an argument. Depending on the + platform, relocs might allow for assembly-time or linker-time resolution to + take place. 
*/ +#define tabp (tab - (1 << (NBITS - 1))) + +static inline mp_double_limb_t +div1 (mp_limb_t n0, mp_limb_t d0) +{ + int ncnt, dcnt; + size_t dbi; + mp_limb_t inv; + mp_limb_t q0; + mp_limb_t r0; + mp_limb_t mask; + mp_double_limb_t res; + + count_leading_zeros (ncnt, n0); + count_leading_zeros (dcnt, d0); + + dbi = d0 << dcnt >> (GMP_LIMB_BITS - NBITS); + inv = tabp[dbi]; + q0 = ((n0 << ncnt) >> (NBITS + 1)) * inv >> (GMP_LIMB_BITS - 1 + ncnt - dcnt); + r0 = n0 - q0 * d0; + mask = -(mp_limb_t) (r0 >= d0); + q0 -= mask; + r0 -= d0 & mask; + + if (UNLIKELY (r0 >= d0)) + { + q0 = n0 / d0; + r0 = n0 - q0 * d0; + } + + res.d1 = q0; + res.d0 = r0; + return res; +} + +#else +#error Unknown HGCD2_DIV1_METHOD +#endif + +#if HAVE_NATIVE_mpn_div_22 + +#define div2 mpn_div_22 +/* Two-limb division optimized for small quotients. */ +mp_limb_t div2 (mp_ptr, mp_limb_t, mp_limb_t, mp_limb_t, mp_limb_t); + +#elif HGCD2_DIV2_METHOD == 1 + +static mp_limb_t +div2 (mp_ptr rp, + mp_limb_t n1, mp_limb_t n0, + mp_limb_t d1, mp_limb_t d0) +{ + mp_double_limb_t rq = div1 (n1, d1); + if (UNLIKELY (rq.d1 > d1)) + { + mp_limb_t n2, q, t1, t0; + int c; + + /* Normalize */ + count_leading_zeros (c, d1); + ASSERT (c > 0); + + n2 = n1 >> (GMP_LIMB_BITS - c); + n1 = (n1 << c) | (n0 >> (GMP_LIMB_BITS - c)); + n0 <<= c; + d1 = (d1 << c) | (d0 >> (GMP_LIMB_BITS - c)); + d0 <<= c; + + udiv_qrnnd (q, n1, n2, n1, d1); + umul_ppmm (t1, t0, q, d0); + if (t1 > n1 || (t1 == n1 && t0 > n0)) + { + ASSERT (q > 0); + q--; + sub_ddmmss (t1, t0, t1, t0, d1, d0); + } + sub_ddmmss (n1, n0, n1, n0, t1, t0); + + /* Undo normalization */ + rp[0] = (n0 >> c) | (n1 << (GMP_LIMB_BITS - c)); + rp[1] = n1 >> c; + + return q; + } + else + { + mp_limb_t q, t1, t0; + n1 = rq.d0; + q = rq.d1; + umul_ppmm (t1, t0, q, d0); + if (UNLIKELY (t1 >= n1) && (t1 > n1 || t0 > n0)) + { + ASSERT (q > 0); + q--; + sub_ddmmss (t1, t0, t1, t0, d1, d0); + } + sub_ddmmss (rp[1], rp[0], n1, n0, t1, t0); + return q; + } +} + +#elif HGCD2_DIV2_METHOD == 2 + +/* Bit-wise div2. Relies on fast count_leading_zeros. */ +static mp_limb_t +div2 (mp_ptr rp, + mp_limb_t n1, mp_limb_t n0, + mp_limb_t d1, mp_limb_t d0) +{ + mp_limb_t q = 0; + int ncnt; + int dcnt; + + count_leading_zeros (ncnt, n1); + count_leading_zeros (dcnt, d1); + dcnt -= ncnt; + + d1 = (d1 << dcnt) + (d0 >> 1 >> (GMP_LIMB_BITS - 1 - dcnt)); + d0 <<= dcnt; + + do + { + mp_limb_t mask; + q <<= 1; + if (UNLIKELY (n1 == d1)) + mask = -(n0 >= d0); + else + mask = -(n1 > d1); + + q -= mask; + + sub_ddmmss (n1, n0, n1, n0, mask & d1, mask & d0); + + d0 = (d1 << (GMP_LIMB_BITS - 1)) | (d0 >> 1); + d1 = d1 >> 1; + } + while (dcnt--); + + rp[0] = n0; + rp[1] = n1; + + return q; +} +#else +#error Unknown HGCD2_DIV2_METHOD +#endif diff --git a/gmp-6.3.0/mpn/generic/hgcd2.c b/gmp-6.3.0/mpn/generic/hgcd2.c new file mode 100644 index 0000000..43d4d48 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd2.c @@ -0,0 +1,283 @@ +/* hgcd2.c + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 1996, 1998, 2000-2004, 2008, 2012, 2019 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. 
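Both div2 variants above implement the same contract: given a two-limb numerator (n1,n0) and a two-limb divisor (d1,d0) with d1 > 0 and n1 >= d1, they return the single-limb quotient and leave the two-limb remainder in rp. The sketch below restates that contract with 32-bit stand-in limbs, so 64-bit arithmetic gives the exact reference result; it is a cross-check aid, not either of the variants above.

/* Reference semantics of div2 with 32-bit stand-in limbs.  Illustration
   only; exact because the full operands fit in 64 bits.  */
#include <assert.h>
#include <stdint.h>

static uint32_t
div2_ref (uint32_t r[2],
          uint32_t n1, uint32_t n0,
          uint32_t d1, uint32_t d0)
{
  uint64_t n = ((uint64_t) n1 << 32) | n0;
  uint64_t d = ((uint64_t) d1 << 32) | d0;
  uint64_t q, rem;

  assert (d1 > 0 && n1 >= d1);   /* the quotient then fits in one limb */
  q = n / d;
  rem = n % d;

  r[0] = (uint32_t) rem;
  r[1] = (uint32_t) (rem >> 32);
  return (uint32_t) q;
}

int
main (void)
{
  uint32_t r[2];
  uint32_t q = div2_ref (r, 7, 0x80000000u, 2, 1);
  assert (q == 3);               /* (7*B + B/2) / (2*B + 1) = 3, B = 2^32 */
  return 0;
}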
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/generic/hgcd2-div.h" + +#if GMP_NAIL_BITS != 0 +#error Nails not implemented +#endif + +/* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs + matrix M. Returns 1 if we make progress, i.e. can perform at least + one subtraction. Otherwise returns zero. */ + +/* FIXME: Possible optimizations: + + The div2 function starts with checking the most significant bit of + the numerator. We can maintained normalized operands here, call + hgcd with normalized operands only, which should make the code + simpler and possibly faster. + + Experiment with table lookups on the most significant bits. + + This function is also a candidate for assembler implementation. +*/ +int +mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl, + struct hgcd_matrix1 *M) +{ + mp_limb_t u00, u01, u10, u11; + + if (ah < 2 || bh < 2) + return 0; + + if (ah > bh || (ah == bh && al > bl)) + { + sub_ddmmss (ah, al, ah, al, bh, bl); + if (ah < 2) + return 0; + + u00 = u01 = u11 = 1; + u10 = 0; + } + else + { + sub_ddmmss (bh, bl, bh, bl, ah, al); + if (bh < 2) + return 0; + + u00 = u10 = u11 = 1; + u01 = 0; + } + + if (ah < bh) + goto subtract_a; + + for (;;) + { + ASSERT (ah >= bh); + if (ah == bh) + goto done; + + if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2))) + { + ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2)); + bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2)); + + break; + } + + /* Subtract a -= q b, and multiply M from the right by (1 q ; 0 + 1), affecting the second column of M. */ + ASSERT (ah > bh); + sub_ddmmss (ah, al, ah, al, bh, bl); + + if (ah < 2) + goto done; + + if (ah <= bh) + { + /* Use q = 1 */ + u01 += u00; + u11 += u10; + } + else + { + mp_limb_t r[2]; + mp_limb_t q = div2 (r, ah, al, bh, bl); + al = r[0]; ah = r[1]; + if (ah < 2) + { + /* A is too small, but q is correct. */ + u01 += q * u00; + u11 += q * u10; + goto done; + } + q++; + u01 += q * u00; + u11 += q * u10; + } + subtract_a: + ASSERT (bh >= ah); + if (ah == bh) + goto done; + + if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2))) + { + ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2)); + bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2)); + + goto subtract_a1; + } + + /* Subtract b -= q a, and multiply M from the right by (1 0 ; q + 1), affecting the first column of M. 
*/ + sub_ddmmss (bh, bl, bh, bl, ah, al); + + if (bh < 2) + goto done; + + if (bh <= ah) + { + /* Use q = 1 */ + u00 += u01; + u10 += u11; + } + else + { + mp_limb_t r[2]; + mp_limb_t q = div2 (r, bh, bl, ah, al); + bl = r[0]; bh = r[1]; + if (bh < 2) + { + /* B is too small, but q is correct. */ + u00 += q * u01; + u10 += q * u11; + goto done; + } + q++; + u00 += q * u01; + u10 += q * u11; + } + } + + /* NOTE: Since we discard the least significant half limb, we don't get a + truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). */ + /* Single precision loop */ + for (;;) + { + ASSERT (ah >= bh); + + ah -= bh; + if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1))) + break; + + if (ah <= bh) + { + /* Use q = 1 */ + u01 += u00; + u11 += u10; + } + else + { + mp_double_limb_t rq = div1 (ah, bh); + mp_limb_t q = rq.d1; + ah = rq.d0; + + if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1))) + { + /* A is too small, but q is correct. */ + u01 += q * u00; + u11 += q * u10; + break; + } + q++; + u01 += q * u00; + u11 += q * u10; + } + subtract_a1: + ASSERT (bh >= ah); + + bh -= ah; + if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1))) + break; + + if (bh <= ah) + { + /* Use q = 1 */ + u00 += u01; + u10 += u11; + } + else + { + mp_double_limb_t rq = div1 (bh, ah); + mp_limb_t q = rq.d1; + bh = rq.d0; + + if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1))) + { + /* B is too small, but q is correct. */ + u00 += q * u01; + u10 += q * u11; + break; + } + q++; + u00 += q * u01; + u10 += q * u11; + } + } + + done: + M->u[0][0] = u00; M->u[0][1] = u01; + M->u[1][0] = u10; M->u[1][1] = u11; + + return 1; +} + +/* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must + * have space for n + 1 limbs. Uses three buffers to avoid a copy*/ +mp_size_t +mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M, + mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n) +{ + mp_limb_t ah, bh; + + /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as + + r = u00 * a + r += u10 * b + b *= u11 + b += u01 * a + */ + +#if HAVE_NATIVE_mpn_addaddmul_1msb0 + ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]); + bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]); +#else + ah = mpn_mul_1 (rp, ap, n, M->u[0][0]); + ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]); + + bh = mpn_mul_1 (bp, bp, n, M->u[1][1]); + bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]); +#endif + rp[n] = ah; + bp[n] = bh; + + n += (ah | bh) > 0; + return n; +} diff --git a/gmp-6.3.0/mpn/generic/hgcd2_jacobi.c b/gmp-6.3.0/mpn/generic/hgcd2_jacobi.c new file mode 100644 index 0000000..95d4af1 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd2_jacobi.c @@ -0,0 +1,251 @@ +/* hgcd2_jacobi.c + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 1996, 1998, 2000-2004, 2008, 2011, 2020 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
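The loop in mpn_hgcd2 maintains the invariant that the original operands equal M times the reduced ones, i.e. a_orig = u00*a + u01*b and b_orig = u10*a + u11*b, each quotient step multiplying M from the right by (1,q;0,1) or (1,0;q,1). The toy program below replays that bookkeeping on ordinary 64-bit integers, using plain division for the quotients, and asserts the invariant after every step; the input values and stopping rule are arbitrary and the code is only a didactic sketch of the matrix construction, not the double-limb routine above.

/* Toy model of the hgcd2 matrix bookkeeping on 64-bit integers.  */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint64_t A = 987654321098765ULL, B = 123456789012345ULL; /* arbitrary inputs */
  uint64_t a = A, b = B;
  uint64_t u00 = 1, u01 = 0, u10 = 0, u11 = 1;             /* M = identity */

  while (a != 0 && b != 0 && (a > 1000 || b > 1000))       /* arbitrary stop */
    {
      if (a >= b)
        {
          uint64_t q = a / b;
          a -= q * b;           /* a -= q*b: M <- M * (1 q; 0 1) */
          u01 += q * u00;
          u11 += q * u10;
        }
      else
        {
          uint64_t q = b / a;
          b -= q * a;           /* b -= q*a: M <- M * (1 0; q 1) */
          u00 += q * u01;
          u10 += q * u11;
        }
      assert (A == u00 * a + u01 * b);
      assert (B == u10 * a + u11 * b);
    }
  printf ("reduced to a=%llu b=%llu, det(M)=%lld\n",
          (unsigned long long) a, (unsigned long long) b,
          (long long) (u00 * u11 - u01 * u10));  /* determinant stays 1 */
  return 0;
}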
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#include "mpn/generic/hgcd2-div.h" + +#if GMP_NAIL_BITS != 0 +#error Nails not implemented +#endif + +int +mpn_hgcd2_jacobi (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl, + struct hgcd_matrix1 *M, unsigned *bitsp) +{ + mp_limb_t u00, u01, u10, u11; + unsigned bits = *bitsp; + + if (ah < 2 || bh < 2) + return 0; + + if (ah > bh || (ah == bh && al > bl)) + { + sub_ddmmss (ah, al, ah, al, bh, bl); + if (ah < 2) + return 0; + + u00 = u01 = u11 = 1; + u10 = 0; + bits = mpn_jacobi_update (bits, 1, 1); + } + else + { + sub_ddmmss (bh, bl, bh, bl, ah, al); + if (bh < 2) + return 0; + + u00 = u10 = u11 = 1; + u01 = 0; + bits = mpn_jacobi_update (bits, 0, 1); + } + + if (ah < bh) + goto subtract_a; + + for (;;) + { + ASSERT (ah >= bh); + if (ah == bh) + goto done; + + if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2))) + { + ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2)); + bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2)); + + break; + } + + /* Subtract a -= q b, and multiply M from the right by (1 q ; 0 + 1), affecting the second column of M. */ + ASSERT (ah > bh); + sub_ddmmss (ah, al, ah, al, bh, bl); + + if (ah < 2) + goto done; + + if (ah <= bh) + { + /* Use q = 1 */ + u01 += u00; + u11 += u10; + bits = mpn_jacobi_update (bits, 1, 1); + } + else + { + mp_limb_t r[2]; + mp_limb_t q = div2 (r, ah, al, bh, bl); + al = r[0]; ah = r[1]; + if (ah < 2) + { + /* A is too small, but q is correct. */ + u01 += q * u00; + u11 += q * u10; + bits = mpn_jacobi_update (bits, 1, q & 3); + goto done; + } + q++; + u01 += q * u00; + u11 += q * u10; + bits = mpn_jacobi_update (bits, 1, q & 3); + } + subtract_a: + ASSERT (bh >= ah); + if (ah == bh) + goto done; + + if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2))) + { + ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2)); + bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2)); + + goto subtract_a1; + } + + /* Subtract b -= q a, and multiply M from the right by (1 0 ; q + 1), affecting the first column of M. */ + sub_ddmmss (bh, bl, bh, bl, ah, al); + + if (bh < 2) + goto done; + + if (bh <= ah) + { + /* Use q = 1 */ + u00 += u01; + u10 += u11; + bits = mpn_jacobi_update (bits, 0, 1); + } + else + { + mp_limb_t r[2]; + mp_limb_t q = div2 (r, bh, bl, ah, al); + bl = r[0]; bh = r[1]; + if (bh < 2) + { + /* B is too small, but q is correct. */ + u00 += q * u01; + u10 += q * u11; + bits = mpn_jacobi_update (bits, 0, q & 3); + goto done; + } + q++; + u00 += q * u01; + u10 += q * u11; + bits = mpn_jacobi_update (bits, 0, q & 3); + } + } + + /* NOTE: Since we discard the least significant half limb, we don't get a + truly maximal M (corresponding to |a - b| < 2^{GMP_LIMB_BITS +1}). 
*/ + /* Single precision loop */ + for (;;) + { + ASSERT (ah >= bh); + + ah -= bh; + if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1))) + break; + + if (ah <= bh) + { + /* Use q = 1 */ + u01 += u00; + u11 += u10; + bits = mpn_jacobi_update (bits, 1, 1); + } + else + { + mp_double_limb_t rq = div1 (ah, bh); + mp_limb_t q = rq.d1; + ah = rq.d0; + + if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1))) + { + /* A is too small, but q is correct. */ + u01 += q * u00; + u11 += q * u10; + bits = mpn_jacobi_update (bits, 1, q & 3); + break; + } + q++; + u01 += q * u00; + u11 += q * u10; + bits = mpn_jacobi_update (bits, 1, q & 3); + } + subtract_a1: + ASSERT (bh >= ah); + + bh -= ah; + if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1))) + break; + + if (bh <= ah) + { + /* Use q = 1 */ + u00 += u01; + u10 += u11; + bits = mpn_jacobi_update (bits, 0, 1); + } + else + { + mp_double_limb_t rq = div1 (bh, ah); + mp_limb_t q = rq.d1; + bh = rq.d0; + + if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1))) + { + /* B is too small, but q is correct. */ + u00 += q * u01; + u10 += q * u11; + bits = mpn_jacobi_update (bits, 0, q & 3); + break; + } + q++; + u00 += q * u01; + u10 += q * u11; + bits = mpn_jacobi_update (bits, 0, q & 3); + } + } + + done: + M->u[0][0] = u00; M->u[0][1] = u01; + M->u[1][0] = u10; M->u[1][1] = u11; + *bitsp = bits; + + return 1; +} diff --git a/gmp-6.3.0/mpn/generic/hgcd_appr.c b/gmp-6.3.0/mpn/generic/hgcd_appr.c new file mode 100644 index 0000000..bb01738 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd_appr.c @@ -0,0 +1,267 @@ +/* hgcd_appr.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Identical to mpn_hgcd_itch. FIXME: Do we really need to add + HGCD_THRESHOLD at the end? */ +mp_size_t +mpn_hgcd_appr_itch (mp_size_t n) +{ + if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD)) + return n; + else + { + unsigned k; + int count; + mp_size_t nscaled; + + /* Get the recursion depth. */ + nscaled = (n - 1) / (HGCD_APPR_THRESHOLD - 1); + count_leading_zeros (count, nscaled); + k = GMP_LIMB_BITS - count; + + return 20 * ((n+3) / 4) + 22 * k + HGCD_THRESHOLD; + } +} + +/* Destroys inputs. 
*/ +int +mpn_hgcd_appr (mp_ptr ap, mp_ptr bp, mp_size_t n, + struct hgcd_matrix *M, mp_ptr tp) +{ + mp_size_t s; + int success = 0; + + ASSERT (n > 0); + + ASSERT ((ap[n-1] | bp[n-1]) != 0); + + if (n <= 2) + /* Implies s = n. A fairly uninteresting case but exercised by the + random inputs of the testsuite. */ + return 0; + + ASSERT ((n+1)/2 - 1 < M->alloc); + + /* We aim for reduction of to GMP_NUMB_BITS * s bits. But each time + we discard some of the least significant limbs, we must keep one + additional bit to account for the truncation error. We maintain + the GMP_NUMB_BITS * s - extra_bits as the current target size. */ + + s = n/2 + 1; + if (BELOW_THRESHOLD (n, HGCD_APPR_THRESHOLD)) + { + unsigned extra_bits = 0; + + while (n > 2) + { + mp_size_t nn; + + ASSERT (n > s); + ASSERT (n <= 2*s); + + nn = mpn_hgcd_step (n, ap, bp, s, M, tp); + if (!nn) + break; + + n = nn; + success = 1; + + /* We can truncate and discard the lower p bits whenever nbits <= + 2*sbits - p. To account for the truncation error, we must + adjust + + sbits <-- sbits + 1 - p, + + rather than just sbits <-- sbits - p. This adjustment makes + the produced matrix slightly smaller than it could be. */ + + if (GMP_NUMB_BITS * (n + 1) + 2 * extra_bits <= 2*GMP_NUMB_BITS * s) + { + mp_size_t p = (GMP_NUMB_BITS * (2*s - n) - 2*extra_bits) / GMP_NUMB_BITS; + + if (extra_bits == 0) + { + /* We cross a limb boundary and bump s. We can't do that + if the result is that it makes makes min(U, V) + smaller than 2^{GMP_NUMB_BITS} s. */ + if (s + 1 == n + || mpn_zero_p (ap + s + 1, n - s - 1) + || mpn_zero_p (bp + s + 1, n - s - 1)) + continue; + + extra_bits = GMP_NUMB_BITS - 1; + s++; + } + else + { + extra_bits--; + } + + /* Drop the p least significant limbs */ + ap += p; bp += p; n -= p; s -= p; + } + } + + ASSERT (s > 0); + + if (extra_bits > 0) + { + /* We can get here only of we have dropped at least one of the least + significant bits, so we can decrement ap and bp. We can then shift + left extra bits using mpn_rshift. */ + /* NOTE: In the unlikely case that n is large, it would be preferable + to do an initial subdiv step to reduce the size before shifting, + but that would mean duplicating mpn_gcd_subdiv_step with a bit + count rather than a limb count. */ + ap--; bp--; + ap[0] = mpn_rshift (ap+1, ap+1, n, GMP_NUMB_BITS - extra_bits); + bp[0] = mpn_rshift (bp+1, bp+1, n, GMP_NUMB_BITS - extra_bits); + n += (ap[n] | bp[n]) > 0; + + ASSERT (success); + + while (n > 2) + { + mp_size_t nn; + + ASSERT (n > s); + ASSERT (n <= 2*s); + + nn = mpn_hgcd_step (n, ap, bp, s, M, tp); + + if (!nn) + return 1; + + n = nn; + } + } + + if (n == 2) + { + struct hgcd_matrix1 M1; + ASSERT (s == 1); + + if (mpn_hgcd2 (ap[1], ap[0], bp[1], bp[0], &M1)) + { + /* Multiply M <- M * M1 */ + mpn_hgcd_matrix_mul_1 (M, &M1, tp); + success = 1; + } + } + return success; + } + else + { + mp_size_t n2 = (3*n)/4 + 1; + mp_size_t p = n/2; + mp_size_t nn; + + nn = mpn_hgcd_reduce (M, ap, bp, n, p, tp); + if (nn) + { + n = nn; + /* FIXME: Discard some of the low limbs immediately? 
*/ + success = 1; + } + + while (n > n2) + { + mp_size_t nn; + + /* Needs n + 1 storage */ + nn = mpn_hgcd_step (n, ap, bp, s, M, tp); + if (!nn) + return success; + + n = nn; + success = 1; + } + if (n > s + 2) + { + struct hgcd_matrix M1; + mp_size_t scratch; + + p = 2*s - n + 1; + scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p); + + mpn_hgcd_matrix_init(&M1, n - p, tp); + if (mpn_hgcd_appr (ap + p, bp + p, n - p, &M1, tp + scratch)) + { + /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */ + ASSERT (M->n + 2 >= M1.n); + + /* Furthermore, assume M ends with a quotient (1, q; 0, 1), + then either q or q + 1 is a correct quotient, and M1 will + start with either (1, 0; 1, 1) or (2, 1; 1, 1). This + rules out the case that the size of M * M1 is much + smaller than the expected M->n + M1->n. */ + + ASSERT (M->n + M1.n < M->alloc); + + /* We need a bound for of M->n + M1.n. Let n be the original + input size. Then + + ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2 + + and it follows that + + M.n + M1.n <= ceil(n/2) + 1 + + Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the + amount of needed scratch space. */ + mpn_hgcd_matrix_mul (M, &M1, tp + scratch); + return 1; + } + } + + for(;;) + { + mp_size_t nn; + + ASSERT (n > s); + ASSERT (n <= 2*s); + + nn = mpn_hgcd_step (n, ap, bp, s, M, tp); + + if (!nn) + return success; + + n = nn; + success = 1; + } + } +} diff --git a/gmp-6.3.0/mpn/generic/hgcd_jacobi.c b/gmp-6.3.0/mpn/generic/hgcd_jacobi.c new file mode 100644 index 0000000..24014ce --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd_jacobi.c @@ -0,0 +1,243 @@ +/* hgcd_jacobi.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003-2005, 2008, 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* This file is almost a copy of hgcd.c, with some added calls to + mpn_jacobi_update */ + +struct hgcd_jacobi_ctx +{ + struct hgcd_matrix *M; + unsigned *bitsp; +}; + +static void +hgcd_jacobi_hook (void *p, mp_srcptr gp, mp_size_t gn, + mp_srcptr qp, mp_size_t qn, int d) +{ + ASSERT (!gp); + ASSERT (d >= 0); + + MPN_NORMALIZE (qp, qn); + if (qn > 0) + { + struct hgcd_jacobi_ctx *ctx = (struct hgcd_jacobi_ctx *) p; + /* NOTES: This is a bit ugly. A tp area is passed to + gcd_subdiv_step, which stores q at the start of that area. We + now use the rest. 
*/ + mp_ptr tp = (mp_ptr) qp + qn; + + mpn_hgcd_matrix_update_q (ctx->M, qp, qn, d, tp); + *ctx->bitsp = mpn_jacobi_update (*ctx->bitsp, d, qp[0] & 3); + } +} + +/* Perform a few steps, using some of mpn_hgcd2, subtraction and + division. Reduces the size by almost one limb or more, but never + below the given size s. Return new size for a and b, or 0 if no + more steps are possible. + + If hgcd2 succeeds, needs temporary space for hgcd_matrix_mul_1, M->n + limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2 + fails, needs space for the quotient, qn <= n - s + 1 limbs, for and + hgcd_matrix_update_q, qn + (size of the appropriate column of M) <= + resulting size of M. + + If N is the input size to the calling hgcd, then s = floor(N/2) + + 1, M->n < N, qn + matrix size <= n - s + 1 + n - s = 2 (n - s) + 1 + < N, so N is sufficient. +*/ + +static mp_size_t +hgcd_jacobi_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s, + struct hgcd_matrix *M, unsigned *bitsp, mp_ptr tp) +{ + struct hgcd_matrix1 M1; + mp_limb_t mask; + mp_limb_t ah, al, bh, bl; + + ASSERT (n > s); + + mask = ap[n-1] | bp[n-1]; + ASSERT (mask > 0); + + if (n == s + 1) + { + if (mask < 4) + goto subtract; + + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else if (mask & GMP_NUMB_HIGHBIT) + { + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else + { + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]); + al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]); + bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]); + bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]); + } + + /* Try an mpn_hgcd2 step */ + if (mpn_hgcd2_jacobi (ah, al, bh, bl, &M1, bitsp)) + { + /* Multiply M <- M * M1 */ + mpn_hgcd_matrix_mul_1 (M, &M1, tp); + + /* Can't swap inputs, so we need to copy. */ + MPN_COPY (tp, ap, n); + /* Multiply M1^{-1} (a;b) */ + return mpn_matrix22_mul1_inverse_vector (&M1, ap, tp, bp, n); + } + + subtract: + { + struct hgcd_jacobi_ctx ctx; + ctx.M = M; + ctx.bitsp = bitsp; + + return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_jacobi_hook, &ctx, tp); + } +} + +/* Reduces a,b until |a-b| fits in n/2 + 1 limbs. Constructs matrix M + with elements of size at most (n+1)/2 - 1. Returns new size of a, + b, or zero if no reduction is possible. */ + +/* Same scratch requirements as for mpn_hgcd. */ +mp_size_t +mpn_hgcd_jacobi (mp_ptr ap, mp_ptr bp, mp_size_t n, + struct hgcd_matrix *M, unsigned *bitsp, mp_ptr tp) +{ + mp_size_t s = n/2 + 1; + + mp_size_t nn; + int success = 0; + + if (n <= s) + /* Happens when n <= 2, a fairly uninteresting case but exercised + by the random inputs of the testsuite. */ + return 0; + + ASSERT ((ap[n-1] | bp[n-1]) > 0); + + ASSERT ((n+1)/2 - 1 < M->alloc); + + if (ABOVE_THRESHOLD (n, HGCD_THRESHOLD)) + { + mp_size_t n2 = (3*n)/4 + 1; + mp_size_t p = n/2; + + nn = mpn_hgcd_jacobi (ap + p, bp + p, n - p, M, bitsp, tp); + if (nn > 0) + { + /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1) + = 2 (n - 1) */ + n = mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp); + success = 1; + } + while (n > n2) + { + /* Needs n + 1 storage */ + nn = hgcd_jacobi_step (n, ap, bp, s, M, bitsp, tp); + if (!nn) + return success ? 
n : 0; + n = nn; + success = 1; + } + + if (n > s + 2) + { + struct hgcd_matrix M1; + mp_size_t scratch; + + p = 2*s - n + 1; + scratch = MPN_HGCD_MATRIX_INIT_ITCH (n-p); + + mpn_hgcd_matrix_init(&M1, n - p, tp); + nn = mpn_hgcd_jacobi (ap + p, bp + p, n - p, &M1, bitsp, tp + scratch); + if (nn > 0) + { + /* We always have max(M) > 2^{-(GMP_NUMB_BITS + 1)} max(M1) */ + ASSERT (M->n + 2 >= M1.n); + + /* Furthermore, assume M ends with a quotient (1, q; 0, 1), + then either q or q + 1 is a correct quotient, and M1 will + start with either (1, 0; 1, 1) or (2, 1; 1, 1). This + rules out the case that the size of M * M1 is much + smaller than the expected M->n + M1->n. */ + + ASSERT (M->n + M1.n < M->alloc); + + /* Needs 2 (p + M->n) <= 2 (2*s - n2 + 1 + n2 - s - 1) + = 2*s <= 2*(floor(n/2) + 1) <= n + 2. */ + n = mpn_hgcd_matrix_adjust (&M1, p + nn, ap, bp, p, tp + scratch); + + /* We need a bound for of M->n + M1.n. Let n be the original + input size. Then + + ceil(n/2) - 1 >= size of product >= M.n + M1.n - 2 + + and it follows that + + M.n + M1.n <= ceil(n/2) + 1 + + Then 3*(M.n + M1.n) + 5 <= 3 * ceil(n/2) + 8 is the + amount of needed scratch space. */ + mpn_hgcd_matrix_mul (M, &M1, tp + scratch); + success = 1; + } + } + } + + for (;;) + { + /* Needs s+3 < n */ + nn = hgcd_jacobi_step (n, ap, bp, s, M, bitsp, tp); + if (!nn) + return success ? n : 0; + + n = nn; + success = 1; + } +} diff --git a/gmp-6.3.0/mpn/generic/hgcd_matrix.c b/gmp-6.3.0/mpn/generic/hgcd_matrix.c new file mode 100644 index 0000000..54c795d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd_matrix.c @@ -0,0 +1,265 @@ +/* hgcd_matrix.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003-2005, 2008, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* For input of size n, matrix elements are of size at most ceil(n/2) + - 1, but we need two limbs extra. */ +void +mpn_hgcd_matrix_init (struct hgcd_matrix *M, mp_size_t n, mp_ptr p) +{ + mp_size_t s = (n+1)/2 + 1; + M->alloc = s; + M->n = 1; + MPN_ZERO (p, 4 * s); + M->p[0][0] = p; + M->p[0][1] = p + s; + M->p[1][0] = p + 2 * s; + M->p[1][1] = p + 3 * s; + + M->p[0][0][0] = M->p[1][1][0] = 1; +} + +/* Update column COL, adding in Q * column (1-COL). Temporary storage: + * qn + n <= M->alloc, where n is the size of the largest element in + * column 1 - COL. 
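mpn_hgcd_matrix_init above carves a single contiguous scratch block of 4*s limbs into the four matrix elements and seeds the identity matrix. A hypothetical standalone version of that layout, using a toy matrix struct rather than GMP's struct hgcd_matrix, is sketched below; it illustrates only the pointer arithmetic and the zero/identity initialisation.

/* Layout sketch: four elements of s limbs each inside one scratch block,
   initialised to the identity matrix.  Toy types, not GMP's.  */
#include <stdint.h>
#include <string.h>

typedef uint64_t limb_t;

struct toy_hgcd_matrix
{
  long alloc;          /* limbs allocated per element    */
  long n;              /* current element size, in limbs */
  limb_t *p[2][2];     /* the four elements              */
};

static void
toy_hgcd_matrix_init (struct toy_hgcd_matrix *M, long n, limb_t *scratch)
{
  long s = (n + 1) / 2 + 1;          /* ceil(n/2) - 1 plus two extra limbs */

  M->alloc = s;
  M->n = 1;
  memset (scratch, 0, 4 * s * sizeof (limb_t));

  M->p[0][0] = scratch;
  M->p[0][1] = scratch + s;
  M->p[1][0] = scratch + 2 * s;
  M->p[1][1] = scratch + 3 * s;

  M->p[0][0][0] = 1;                 /* M = identity */
  M->p[1][1][0] = 1;
}

int
main (void)
{
  limb_t buf[4 * ((9 + 1) / 2 + 1)]; /* room for n = 9 */
  struct toy_hgcd_matrix M;
  toy_hgcd_matrix_init (&M, 9, buf);
  return (M.p[0][0][0] + M.p[1][1][0] == 2) ? 0 : 1;
}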
*/ +void +mpn_hgcd_matrix_update_q (struct hgcd_matrix *M, mp_srcptr qp, mp_size_t qn, + unsigned col, mp_ptr tp) +{ + ASSERT (col < 2); + + if (qn == 1) + { + mp_limb_t q = qp[0]; + mp_limb_t c0, c1; + + c0 = mpn_addmul_1 (M->p[0][col], M->p[0][1-col], M->n, q); + c1 = mpn_addmul_1 (M->p[1][col], M->p[1][1-col], M->n, q); + + M->p[0][col][M->n] = c0; + M->p[1][col][M->n] = c1; + + M->n += (c0 | c1) != 0; + } + else + { + unsigned row; + + /* Carries for the unlikely case that we get both high words + from the multiplication and carries from the addition. */ + mp_limb_t c[2]; + mp_size_t n; + + /* The matrix will not necessarily grow in size by qn, so we + need normalization in order not to overflow M. */ + + for (n = M->n; n + qn > M->n; n--) + { + ASSERT (n > 0); + if (M->p[0][1-col][n-1] > 0 || M->p[1][1-col][n-1] > 0) + break; + } + + ASSERT (qn + n <= M->alloc); + + for (row = 0; row < 2; row++) + { + if (qn <= n) + mpn_mul (tp, M->p[row][1-col], n, qp, qn); + else + mpn_mul (tp, qp, qn, M->p[row][1-col], n); + + ASSERT (n + qn >= M->n); + c[row] = mpn_add (M->p[row][col], tp, n + qn, M->p[row][col], M->n); + } + + n += qn; + + if (c[0] | c[1]) + { + M->p[0][col][n] = c[0]; + M->p[1][col][n] = c[1]; + n++; + } + else + { + n -= (M->p[0][col][n-1] | M->p[1][col][n-1]) == 0; + ASSERT (n >= M->n); + } + M->n = n; + } + + ASSERT (M->n < M->alloc); +} + +/* Multiply M by M1 from the right. Since the M1 elements fit in + GMP_NUMB_BITS - 1 bits, M grows by at most one limb. Needs + temporary space M->n */ +void +mpn_hgcd_matrix_mul_1 (struct hgcd_matrix *M, const struct hgcd_matrix1 *M1, + mp_ptr tp) +{ + mp_size_t n0, n1; + + /* Could avoid copy by some swapping of pointers. */ + MPN_COPY (tp, M->p[0][0], M->n); + n0 = mpn_hgcd_mul_matrix1_vector (M1, M->p[0][0], tp, M->p[0][1], M->n); + MPN_COPY (tp, M->p[1][0], M->n); + n1 = mpn_hgcd_mul_matrix1_vector (M1, M->p[1][0], tp, M->p[1][1], M->n); + + /* Depends on zero initialization */ + M->n = MAX(n0, n1); + ASSERT (M->n < M->alloc); +} + +/* Multiply M by M1 from the right. Needs 3*(M->n + M1->n) + 5 limbs + of temporary storage (see mpn_matrix22_mul_itch). */ +void +mpn_hgcd_matrix_mul (struct hgcd_matrix *M, const struct hgcd_matrix *M1, + mp_ptr tp) +{ + mp_size_t n; + + /* About the new size of M:s elements. Since M1's diagonal elements + are > 0, no element can decrease. The new elements are of size + M->n + M1->n, one limb more or less. The computation of the + matrix product produces elements of size M->n + M1->n + 1. But + the true size, after normalization, may be three limbs smaller. + + The reason that the product has normalized size >= M->n + M1->n - + 2 is subtle. It depends on the fact that M and M1 can be factored + as products of (1,1; 0,1) and (1,0; 1,1), and that we can't have + M ending with a large power and M1 starting with a large power of + the same matrix. */ + + /* FIXME: Strassen multiplication gives only a small speedup. In FFT + multiplication range, this function could be sped up quite a lot + using invariance. */ + ASSERT (M->n + M1->n < M->alloc); + + ASSERT ((M->p[0][0][M->n-1] | M->p[0][1][M->n-1] + | M->p[1][0][M->n-1] | M->p[1][1][M->n-1]) > 0); + + ASSERT ((M1->p[0][0][M1->n-1] | M1->p[0][1][M1->n-1] + | M1->p[1][0][M1->n-1] | M1->p[1][1][M1->n-1]) > 0); + + mpn_matrix22_mul (M->p[0][0], M->p[0][1], + M->p[1][0], M->p[1][1], M->n, + M1->p[0][0], M1->p[0][1], + M1->p[1][0], M1->p[1][1], M1->n, tp); + + /* Index of last potentially non-zero limb, size is one greater. 
*/ + n = M->n + M1->n; + + n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0); + n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0); + n -= ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) == 0); + + ASSERT ((M->p[0][0][n] | M->p[0][1][n] | M->p[1][0][n] | M->p[1][1][n]) > 0); + + M->n = n + 1; +} + +/* Multiplies the least significant p limbs of (a;b) by M^-1. + Temporary space needed: 2 * (p + M->n)*/ +mp_size_t +mpn_hgcd_matrix_adjust (const struct hgcd_matrix *M, + mp_size_t n, mp_ptr ap, mp_ptr bp, + mp_size_t p, mp_ptr tp) +{ + /* M^-1 (a;b) = (r11, -r01; -r10, r00) (a ; b) + = (r11 a - r01 b; - r10 a + r00 b */ + + mp_ptr t0 = tp; + mp_ptr t1 = tp + p + M->n; + mp_limb_t ah, bh; + mp_limb_t cy; + + ASSERT (p + M->n < n); + + /* First compute the two values depending on a, before overwriting a */ + + if (M->n >= p) + { + mpn_mul (t0, M->p[1][1], M->n, ap, p); + mpn_mul (t1, M->p[1][0], M->n, ap, p); + } + else + { + mpn_mul (t0, ap, p, M->p[1][1], M->n); + mpn_mul (t1, ap, p, M->p[1][0], M->n); + } + + /* Update a */ + MPN_COPY (ap, t0, p); + ah = mpn_add (ap + p, ap + p, n - p, t0 + p, M->n); + + if (M->n >= p) + mpn_mul (t0, M->p[0][1], M->n, bp, p); + else + mpn_mul (t0, bp, p, M->p[0][1], M->n); + + cy = mpn_sub (ap, ap, n, t0, p + M->n); + ASSERT (cy <= ah); + ah -= cy; + + /* Update b */ + if (M->n >= p) + mpn_mul (t0, M->p[0][0], M->n, bp, p); + else + mpn_mul (t0, bp, p, M->p[0][0], M->n); + + MPN_COPY (bp, t0, p); + bh = mpn_add (bp + p, bp + p, n - p, t0 + p, M->n); + cy = mpn_sub (bp, bp, n, t1, p + M->n); + ASSERT (cy <= bh); + bh -= cy; + + if (ah > 0 || bh > 0) + { + ap[n] = ah; + bp[n] = bh; + n++; + } + else + { + /* The subtraction can reduce the size by at most one limb. */ + if (ap[n-1] == 0 && bp[n-1] == 0) + n--; + } + ASSERT (ap[n-1] > 0 || bp[n-1] > 0); + return n; +} diff --git a/gmp-6.3.0/mpn/generic/hgcd_reduce.c b/gmp-6.3.0/mpn/generic/hgcd_reduce.c new file mode 100644 index 0000000..3aee77d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd_reduce.c @@ -0,0 +1,242 @@ +/* hgcd_reduce.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Computes R -= A * B. Result must be non-negative. Normalized down + to size an, and resulting size is returned. 
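mpn_hgcd_matrix_adjust above relies on the hgcd matrices having determinant 1, so the inverse is simply the adjugate: M^{-1} = (m11, -m01; -m10, m00). The small single-word program below illustrates applying that inverse and checking it against M; the matrix entries and operands are arbitrary values chosen so everything stays non-negative, mirroring the assumption made in the mpn code.

/* Determinant-1 matrices: applying the adjugate undoes M.  Single-word
   illustration of the identity used by mpn_hgcd_matrix_adjust.  */
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* An hgcd-style matrix (non-negative entries, det = 1). */
  uint64_t m00 = 7, m01 = 5, m10 = 4, m11 = 3;
  assert (m00 * m11 - m01 * m10 == 1);

  /* Reduced pair (a;b) and corresponding original pair (A;B) = M (a;b). */
  uint64_t a = 11, b = 2;
  uint64_t A = m00 * a + m01 * b;          /* 87 */
  uint64_t B = m10 * a + m11 * b;          /* 50 */

  /* M^{-1} (A;B) = (m11*A - m01*B; -m10*A + m00*B) recovers (a;b). */
  uint64_t a2 = m11 * A - m01 * B;
  uint64_t b2 = m00 * B - m10 * A;
  assert (a2 == a && b2 == b);
  return 0;
}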
*/ +static mp_size_t +submul (mp_ptr rp, mp_size_t rn, + mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn) +{ + mp_ptr tp; + TMP_DECL; + + ASSERT (bn > 0); + ASSERT (an >= bn); + ASSERT (rn >= an); + ASSERT (an + bn <= rn + 1); + + TMP_MARK; + tp = TMP_ALLOC_LIMBS (an + bn); + + mpn_mul (tp, ap, an, bp, bn); + ASSERT ((an + bn <= rn) || (tp[rn] == 0)); + ASSERT_NOCARRY (mpn_sub (rp, rp, rn, tp, an + bn - (an + bn > rn))); + TMP_FREE; + + while (rn > an && (rp[rn-1] == 0)) + rn--; + + return rn; +} + +/* Computes (a, b) <-- M^{-1} (a; b) */ +/* FIXME: + x Take scratch parameter, and figure out scratch need. + + x Use some fallback for small M->n? +*/ +static mp_size_t +hgcd_matrix_apply (const struct hgcd_matrix *M, + mp_ptr ap, mp_ptr bp, + mp_size_t n) +{ + mp_size_t an, bn, un, vn, nn; + mp_size_t mn[2][2]; + mp_size_t modn; + mp_ptr tp, sp, scratch; + mp_limb_t cy; + unsigned i, j; + + TMP_DECL; + + ASSERT ( (ap[n-1] | bp[n-1]) > 0); + + an = n; + MPN_NORMALIZE (ap, an); + bn = n; + MPN_NORMALIZE (bp, bn); + + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + { + mp_size_t k; + k = M->n; + MPN_NORMALIZE (M->p[i][j], k); + mn[i][j] = k; + } + + ASSERT (mn[0][0] > 0); + ASSERT (mn[1][1] > 0); + ASSERT ( (mn[0][1] | mn[1][0]) > 0); + + TMP_MARK; + + if (mn[0][1] == 0) + { + /* A unchanged, M = (1, 0; q, 1) */ + ASSERT (mn[0][0] == 1); + ASSERT (M->p[0][0][0] == 1); + ASSERT (mn[1][1] == 1); + ASSERT (M->p[1][1][0] == 1); + + /* Put B <-- B - q A */ + nn = submul (bp, bn, ap, an, M->p[1][0], mn[1][0]); + } + else if (mn[1][0] == 0) + { + /* B unchanged, M = (1, q; 0, 1) */ + ASSERT (mn[0][0] == 1); + ASSERT (M->p[0][0][0] == 1); + ASSERT (mn[1][1] == 1); + ASSERT (M->p[1][1][0] == 1); + + /* Put A <-- A - q * B */ + nn = submul (ap, an, bp, bn, M->p[0][1], mn[0][1]); + } + else + { + /* A = m00 a + m01 b ==> a <= A / m00, b <= A / m01. + B = m10 a + m11 b ==> a <= B / m10, b <= B / m11. */ + un = MIN (an - mn[0][0], bn - mn[1][0]) + 1; + vn = MIN (an - mn[0][1], bn - mn[1][1]) + 1; + + nn = MAX (un, vn); + /* In the range of interest, mulmod_bnm1 should always beat mullo. */ + modn = mpn_mulmod_bnm1_next_size (nn + 1); + + TMP_ALLOC_LIMBS_3 (tp, modn, + sp, modn, + scratch, mpn_mulmod_bnm1_itch (modn, modn, M->n)); + + ASSERT (n <= 2*modn); + + if (n > modn) + { + cy = mpn_add (ap, ap, modn, ap + modn, n - modn); + MPN_INCR_U (ap, modn, cy); + + cy = mpn_add (bp, bp, modn, bp + modn, n - modn); + MPN_INCR_U (bp, modn, cy); + + n = modn; + } + + mpn_mulmod_bnm1 (tp, modn, ap, n, M->p[1][1], mn[1][1], scratch); + mpn_mulmod_bnm1 (sp, modn, bp, n, M->p[0][1], mn[0][1], scratch); + + /* FIXME: Handle the small n case in some better way. 
*/ + if (n + mn[1][1] < modn) + MPN_ZERO (tp + n + mn[1][1], modn - n - mn[1][1]); + if (n + mn[0][1] < modn) + MPN_ZERO (sp + n + mn[0][1], modn - n - mn[0][1]); + + cy = mpn_sub_n (tp, tp, sp, modn); + MPN_DECR_U (tp, modn, cy); + + ASSERT (mpn_zero_p (tp + nn, modn - nn)); + + mpn_mulmod_bnm1 (sp, modn, ap, n, M->p[1][0], mn[1][0], scratch); + MPN_COPY (ap, tp, nn); + mpn_mulmod_bnm1 (tp, modn, bp, n, M->p[0][0], mn[0][0], scratch); + + if (n + mn[1][0] < modn) + MPN_ZERO (sp + n + mn[1][0], modn - n - mn[1][0]); + if (n + mn[0][0] < modn) + MPN_ZERO (tp + n + mn[0][0], modn - n - mn[0][0]); + + cy = mpn_sub_n (tp, tp, sp, modn); + MPN_DECR_U (tp, modn, cy); + + ASSERT (mpn_zero_p (tp + nn, modn - nn)); + MPN_COPY (bp, tp, nn); + + while ( (ap[nn-1] | bp[nn-1]) == 0) + { + nn--; + ASSERT (nn > 0); + } + } + TMP_FREE; + + return nn; +} + +mp_size_t +mpn_hgcd_reduce_itch (mp_size_t n, mp_size_t p) +{ + mp_size_t itch; + if (BELOW_THRESHOLD (n, HGCD_REDUCE_THRESHOLD)) + { + itch = mpn_hgcd_itch (n-p); + + /* For arbitrary p, the storage for _adjust is 2*(p + M->n) = 2 * + (p + ceil((n-p)/2) - 1 <= n + p - 1 */ + if (itch < n + p - 1) + itch = n + p - 1; + } + else + { + itch = 2*(n-p) + mpn_hgcd_itch (n-p); + /* Currently, hgcd_matrix_apply allocates its own storage. */ + } + return itch; +} + +/* FIXME: Document storage need. */ +mp_size_t +mpn_hgcd_reduce (struct hgcd_matrix *M, + mp_ptr ap, mp_ptr bp, mp_size_t n, mp_size_t p, + mp_ptr tp) +{ + mp_size_t nn; + if (BELOW_THRESHOLD (n, HGCD_REDUCE_THRESHOLD)) + { + nn = mpn_hgcd (ap + p, bp + p, n - p, M, tp); + if (nn > 0) + /* Needs 2*(p + M->n) <= 2*(floor(n/2) + ceil(n/2) - 1) + = 2 (n - 1) */ + return mpn_hgcd_matrix_adjust (M, p + nn, ap, bp, p, tp); + } + else + { + MPN_COPY (tp, ap + p, n - p); + MPN_COPY (tp + n - p, bp + p, n - p); + if (mpn_hgcd_appr (tp, tp + n - p, n - p, M, tp + 2*(n-p))) + return hgcd_matrix_apply (M, ap, bp, n); + } + return 0; +} diff --git a/gmp-6.3.0/mpn/generic/hgcd_step.c b/gmp-6.3.0/mpn/generic/hgcd_step.c new file mode 100644 index 0000000..a978a88 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/hgcd_step.c @@ -0,0 +1,127 @@ +/* hgcd_step.c. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003-2005, 2008, 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
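Before the mulmod_bnm1 calls, hgcd_matrix_apply above reduces the operands modulo B^modn - 1 by folding: the limbs above position modn are added onto the low part, and any carry wraps around again, since B^modn is congruent to 1. The sketch below shows that folding on toy 16-bit limbs; the function and types are illustrative stand-ins, not GMP internals.

/* Reduction modulo B^k - 1 by folding, with toy 16-bit limbs.  */
#include <assert.h>
#include <stdint.h>

typedef uint16_t limb_t;

static void
fold_bnm1 (limb_t *rp, const limb_t *ap, int n, int k)  /* assumes n <= 2*k */
{
  uint32_t carry = 0;
  int i;

  for (i = 0; i < k; i++)
    {
      uint32_t hi = (i < n - k) ? ap[k + i] : 0;
      uint32_t s = (uint32_t) ap[i] + hi + carry;
      rp[i] = (limb_t) s;
      carry = s >> 16;
    }
  /* Wrap the final carry around: B^k == 1 (mod B^k - 1). */
  for (i = 0; carry != 0 && i < k; i++)
    {
      uint32_t s = (uint32_t) rp[i] + carry;
      rp[i] = (limb_t) s;
      carry = s >> 16;
    }
}

int
main (void)
{
  /* a = 3*B^2 + 2*B + 1 with B = 2^16, reduced mod B^2 - 1. */
  limb_t a[3] = { 0x0001, 0x0002, 0x0003 };
  limb_t r[2];
  fold_bnm1 (r, a, 3, 2);
  assert (r[0] == 0x0004 && r[1] == 0x0002);  /* 131076 = a mod (2^32 - 1) */
  return 0;
}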
*/ + +#include "gmp-impl.h" +#include "longlong.h" + + +static void +hgcd_hook (void *p, mp_srcptr gp, mp_size_t gn, + mp_srcptr qp, mp_size_t qn, int d) +{ + ASSERT (!gp); + ASSERT (d >= 0); + ASSERT (d <= 1); + + MPN_NORMALIZE (qp, qn); + if (qn > 0) + { + struct hgcd_matrix *M = (struct hgcd_matrix *) p; + /* NOTES: This is a bit ugly. A tp area is passed to + gcd_subdiv_step, which stores q at the start of that area. We + now use the rest. */ + mp_ptr tp = (mp_ptr) qp + qn; + mpn_hgcd_matrix_update_q (M, qp, qn, d, tp); + } +} + +/* Perform a few steps, using some of mpn_hgcd2, subtraction and + division. Reduces the size by almost one limb or more, but never + below the given size s. Return new size for a and b, or 0 if no + more steps are possible. + + If hgcd2 succeeds, needs temporary space for hgcd_matrix_mul_1, M->n + limbs, and hgcd_mul_matrix1_inverse_vector, n limbs. If hgcd2 + fails, needs space for the quotient, qn <= n - s limbs, for and + hgcd_matrix_update_q, qn + (size of the appropriate column of M) <= + (resulting size of M) + 1. + + If N is the input size to the calling hgcd, then s = floor(N/2) + + 1, M->n < N, qn + product size <= n - s + n - s + 1 = 2 (n - s) + 1 + <= N. +*/ + +mp_size_t +mpn_hgcd_step (mp_size_t n, mp_ptr ap, mp_ptr bp, mp_size_t s, + struct hgcd_matrix *M, mp_ptr tp) +{ + struct hgcd_matrix1 M1; + mp_limb_t mask; + mp_limb_t ah, al, bh, bl; + + ASSERT (n > s); + + mask = ap[n-1] | bp[n-1]; + ASSERT (mask > 0); + + if (n == s + 1) + { + if (mask < 4) + goto subtract; + + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else if (mask & GMP_NUMB_HIGHBIT) + { + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else + { + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]); + al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]); + bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]); + bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]); + } + + /* Try an mpn_hgcd2 step */ + if (mpn_hgcd2 (ah, al, bh, bl, &M1)) + { + /* Multiply M <- M * M1 */ + mpn_hgcd_matrix_mul_1 (M, &M1, tp); + + /* Can't swap inputs, so we need to copy. */ + MPN_COPY (tp, ap, n); + /* Multiply M1^{-1} (a;b) */ + return mpn_matrix22_mul1_inverse_vector (&M1, ap, tp, bp, n); + } + + subtract: + + return mpn_gcd_subdiv_step (ap, bp, n, s, hgcd_hook, M, tp); +} diff --git a/gmp-6.3.0/mpn/generic/invert.c b/gmp-6.3.0/mpn/generic/invert.c new file mode 100644 index 0000000..157ff2b --- /dev/null +++ b/gmp-6.3.0/mpn/generic/invert.c @@ -0,0 +1,86 @@ +/* invert.c -- Compute floor((B^{2n}-1)/U) - B^n. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright (C) 2007, 2009, 2010, 2012, 2014-2016 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
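mpn_hgcd_step above (like hgcd_jacobi_step earlier) feeds mpn_hgcd2 with the two most significant limbs of a and b, left-shifted by the number of leading zero bits in ap[n-1] | bp[n-1] so that at least one of the leading words is normalised. MPN_EXTRACT_NUMB is a GMP-internal macro, so the free-standing sketch below spells the shifts out by hand for nail-free 64-bit limbs; it is an illustration of the extraction only.

/* Extracting the two leading 64-bit words of {ap,n}, left-normalised by
   `shift' bits.  Illustration only.  */
#include <assert.h>
#include <stdint.h>

static void
extract_top2 (const uint64_t *ap, long n, unsigned shift,
              uint64_t *hi, uint64_t *lo)
{
  assert (n >= 3 && shift < 64);
  if (shift == 0)
    {
      *hi = ap[n-1];
      *lo = ap[n-2];
    }
  else
    {
      *hi = (ap[n-1] << shift) | (ap[n-2] >> (64 - shift));
      *lo = (ap[n-2] << shift) | (ap[n-3] >> (64 - shift));
    }
}

int
main (void)
{
  uint64_t a[3] = { 5, 0x123456789abcdef0ULL, 0x0000000000ffffffULL };
  uint64_t hi, lo;
  extract_top2 (a, 3, 40, &hi, &lo);  /* a[2] has 40 leading zero bits */
  assert (hi >> 63);                  /* top word is now normalised */
  (void) lo;
  return 0;
}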
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +void +mpn_invert (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch) +{ + ASSERT (n > 0); + ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT); + ASSERT (! MPN_OVERLAP_P (ip, n, dp, n)); + ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n))); + ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n))); + + if (n == 1) + invert_limb (*ip, *dp); + else if (BELOW_THRESHOLD (n, INV_APPR_THRESHOLD)) + { + /* Maximum scratch needed by this branch: 2*n */ + mp_ptr xp; + + xp = scratch; /* 2 * n limbs */ + /* n > 1 here */ + MPN_FILL (xp, n, GMP_NUMB_MAX); + mpn_com (xp + n, dp, n); + if (n == 2) { + mpn_divrem_2 (ip, 0, xp, 4, dp); + } else { + gmp_pi1_t inv; + invert_pi1 (inv, dp[n-1], dp[n-2]); + /* FIXME: should we use dcpi1_div_q, for big sizes? */ + mpn_sbpi1_div_q (ip, xp, 2 * n, dp, n, inv.inv32); + } + } + else { /* Use approximated inverse; correct the result if needed. */ + mp_limb_t e; /* The possible error in the approximate inverse */ + + ASSERT ( mpn_invert_itch (n) >= mpn_invertappr_itch (n) ); + e = mpn_ni_invertappr (ip, dp, n, scratch); + + if (UNLIKELY (e)) { /* Assume the error can only be "0" (no error) or "1". */ + /* Code to detect and correct the "off by one" approximation. */ + mpn_mul_n (scratch, ip, dp, n); + e = mpn_add_n (scratch, scratch, dp, n); /* FIXME: we only need e.*/ + if (LIKELY(e)) /* The high part can not give a carry by itself. */ + e = mpn_add_nc (scratch + n, scratch + n, dp, n, e); /* FIXME:e */ + /* If the value was wrong (no carry), correct it (increment). */ + e ^= CNST_LIMB (1); + MPN_INCR_U (ip, n, e); + } + } +} diff --git a/gmp-6.3.0/mpn/generic/invertappr.c b/gmp-6.3.0/mpn/generic/invertappr.c new file mode 100644 index 0000000..3be5596 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/invertappr.c @@ -0,0 +1,300 @@ +/* mpn_invertappr and helper functions. Compute I such that + floor((B^{2n}-1)/U - 1 <= I + B^n <= floor((B^{2n}-1)/U. + + Contributed to the GNU project by Marco Bodrato. + + The algorithm used here was inspired by ApproximateReciprocal from "Modern + Computer Arithmetic", by Richard P. Brent and Paul Zimmermann. Special + thanks to Paul Zimmermann for his very valuable suggestions on all the + theoretical aspects during the work on this code. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright (C) 2007, 2009, 2010, 2012, 2015, 2016 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. 
+ +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* FIXME: The iterative version splits the operand in two slightly unbalanced + parts, the use of log_2 (or counting the bits) underestimate the maximum + number of iterations. */ + +#if TUNE_PROGRAM_BUILD +#define NPOWS \ + ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t))) +#define MAYBE_dcpi1_divappr 1 +#else +#define NPOWS \ + ((sizeof(mp_size_t) > 6 ? 48 : 8*sizeof(mp_size_t)) - LOG2C (INV_NEWTON_THRESHOLD)) +#define MAYBE_dcpi1_divappr \ + (INV_NEWTON_THRESHOLD < DC_DIVAPPR_Q_THRESHOLD) +#if (INV_NEWTON_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) && \ + (INV_APPR_THRESHOLD > INV_MULMOD_BNM1_THRESHOLD) +#undef INV_MULMOD_BNM1_THRESHOLD +#define INV_MULMOD_BNM1_THRESHOLD 0 /* always when Newton */ +#endif +#endif + +/* All the three functions mpn{,_bc,_ni}_invertappr (ip, dp, n, scratch), take + the strictly normalised value {dp,n} (i.e., most significant bit must be set) + as an input, and compute {ip,n}: the approximate reciprocal of {dp,n}. + + Let e = mpn*_invertappr (ip, dp, n, scratch) be the returned value; the + following conditions are satisfied by the output: + 0 <= e <= 1; + {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1+e) . + I.e. e=0 means that the result {ip,n} equals the one given by mpn_invert. + e=1 means that the result _may_ be one less than expected. + + The _bc version returns e=1 most of the time. + The _ni version should return e=0 most of the time; only about 1% of + possible random input should give e=1. + + When the strict result is needed, i.e., e=0 in the relation above: + {dp,n}*(B^n+{ip,n}) < B^{2n} <= {dp,n}*(B^n+{ip,n}+1) ; + the function mpn_invert (ip, dp, n, scratch) should be used instead. */ + +/* Maximum scratch needed by this branch (at xp): 2*n */ +static mp_limb_t +mpn_bc_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr xp) +{ + ASSERT (n > 0); + ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT); + ASSERT (! MPN_OVERLAP_P (ip, n, dp, n)); + ASSERT (! MPN_OVERLAP_P (ip, n, xp, mpn_invertappr_itch(n))); + ASSERT (! MPN_OVERLAP_P (dp, n, xp, mpn_invertappr_itch(n))); + + /* Compute a base value of r limbs. */ + if (n == 1) + invert_limb (*ip, *dp); + else { + /* n > 1 here */ + MPN_FILL (xp, n, GMP_NUMB_MAX); + mpn_com (xp + n, dp, n); + + /* Now xp contains B^2n - {dp,n}*B^n - 1 */ + + /* FIXME: if mpn_*pi1_divappr_q handles n==2, use it! */ + if (n == 2) { + mpn_divrem_2 (ip, 0, xp, 4, dp); + } else { + gmp_pi1_t inv; + invert_pi1 (inv, dp[n-1], dp[n-2]); + if (! MAYBE_dcpi1_divappr + || BELOW_THRESHOLD (n, DC_DIVAPPR_Q_THRESHOLD)) + mpn_sbpi1_divappr_q (ip, xp, 2 * n, dp, n, inv.inv32); + else + mpn_dcpi1_divappr_q (ip, xp, 2 * n, dp, n, &inv); + MPN_DECR_U(ip, n, CNST_LIMB (1)); + return 1; + } + } + return 0; +} + +/* mpn_ni_invertappr: computes the approximate reciprocal using Newton's + iterations (at least one). + + Inspired by Algorithm "ApproximateReciprocal", published in "Modern Computer + Arithmetic" by Richard P. Brent and Paul Zimmermann, algorithm 3.5, page 121 + in version 0.4 of the book. 
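+
+   As a toy, hand-checkable illustration of the reciprocal relation stated
+   above mpn_bc_invertappr (using decimal digits instead of limbs, so B = 10
+   and n = 2, purely as a sketch): for {dp,n} = 51, floor((10^4 - 1)/51) - 10^2
+   = 196 - 100 = 96, and indeed 51*(100+96) = 9996 < 10^4 <= 51*(100+96+1) =
+   10047, i.e. the strict (e = 0) bounds already hold for {ip,n} = 96.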
+ + Some adaptations were introduced, to allow product mod B^m-1 and return the + value e. + + We introduced a correction in such a way that "the value of + B^{n+h}-T computed at step 8 cannot exceed B^n-1" (the book reads + "2B^n-1"). + + Maximum scratch needed by this branch <= 2*n, but have to fit 3*rn + in the scratch, i.e. 3*rn <= 2*n: we require n>4. + + We use a wrapped product modulo B^m-1. NOTE: is there any normalisation + problem for the [0] class? It shouldn't: we compute 2*|A*X_h - B^{n+h}| < + B^m-1. We may get [0] if and only if we get AX_h = B^{n+h}. This can + happen only if A=B^{n}/2, but this implies X_h = B^{h}*2-1 i.e., AX_h = + B^{n+h} - A, then we get into the "negative" branch, where X_h is not + incremented (because A < B^n). + + FIXME: the scratch for mulmod_bnm1 does not currently fit in the scratch, it + is allocated apart. + */ + +mp_limb_t +mpn_ni_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch) +{ + mp_limb_t cy; + mp_size_t rn, mn; + mp_size_t sizes[NPOWS], *sizp; + mp_ptr tp; + TMP_DECL; +#define xp scratch + + ASSERT (n > 4); + ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT); + ASSERT (! MPN_OVERLAP_P (ip, n, dp, n)); + ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n))); + ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n))); + + /* Compute the computation precisions from highest to lowest, leaving the + base case size in 'rn'. */ + sizp = sizes; + rn = n; + do { + *sizp = rn; + rn = (rn >> 1) + 1; + ++sizp; + } while (ABOVE_THRESHOLD (rn, INV_NEWTON_THRESHOLD)); + + /* We search the inverse of 0.{dp,n}, we compute it as 1.{ip,n} */ + dp += n; + ip += n; + + /* Compute a base value of rn limbs. */ + mpn_bc_invertappr (ip - rn, dp - rn, rn, scratch); + + TMP_MARK; + + if (ABOVE_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD)) + { + mn = mpn_mulmod_bnm1_next_size (n + 1); + tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (mn, n, (n >> 1) + 1)); + } + /* Use Newton's iterations to get the desired precision.*/ + + while (1) { + n = *--sizp; + /* + v n v + +----+--+ + ^ rn ^ + */ + + /* Compute i_jd . */ + if (BELOW_THRESHOLD (n, INV_MULMOD_BNM1_THRESHOLD) + || ((mn = mpn_mulmod_bnm1_next_size (n + 1)) > (n + rn))) { + /* FIXME: We do only need {xp,n+1}*/ + mpn_mul (xp, dp - n, n, ip - rn, rn); + mpn_add_n (xp + rn, xp + rn, dp - n, n - rn + 1); + cy = CNST_LIMB(1); /* Remember we truncated, Mod B^(n+1) */ + /* We computed (truncated) {xp,n+1} <- 1.{ip,rn} * 0.{dp,n} */ + } else { /* Use B^mn-1 wraparound */ + mpn_mulmod_bnm1 (xp, mn, dp - n, n, ip - rn, rn, tp); + /* We computed {xp,mn} <- {ip,rn} * {dp,n} mod (B^mn-1) */ + /* We know that 2*|ip*dp + dp*B^rn - B^{rn+n}| < B^mn-1 */ + /* Add dp*B^rn mod (B^mn-1) */ + ASSERT (n >= mn - rn); + cy = mpn_add_n (xp + rn, xp + rn, dp - n, mn - rn); + cy = mpn_add_nc (xp, xp, dp - (n - (mn - rn)), n - (mn - rn), cy); + /* Subtract B^{rn+n}, maybe only compensate the carry*/ + xp[mn] = CNST_LIMB (1); /* set a limit for DECR_U */ + MPN_DECR_U (xp + rn + n - mn, 2 * mn + 1 - rn - n, CNST_LIMB (1) - cy); + MPN_DECR_U (xp, mn, CNST_LIMB (1) - xp[mn]); /* if DECR_U eroded xp[mn] */ + cy = CNST_LIMB(0); /* Remember we are working Mod B^mn-1 */ + } + + if (xp[n] < CNST_LIMB (2)) { /* "positive" residue class */ + cy = xp[n]; /* 0 <= cy <= 1 here. 
*/ +#if HAVE_NATIVE_mpn_sublsh1_n + if (cy++) { + if (mpn_cmp (xp, dp - n, n) > 0) { + mp_limb_t chk; + chk = mpn_sublsh1_n (xp, xp, dp - n, n); + ASSERT (chk == xp[n]); + ++ cy; + } else + ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n)); + } +#else /* no mpn_sublsh1_n*/ + if (cy++ && !mpn_sub_n (xp, xp, dp - n, n)) { + ASSERT_CARRY (mpn_sub_n (xp, xp, dp - n, n)); + ++cy; + } +#endif + /* 1 <= cy <= 3 here. */ +#if HAVE_NATIVE_mpn_rsblsh1_n + if (mpn_cmp (xp, dp - n, n) > 0) { + ASSERT_NOCARRY (mpn_rsblsh1_n (xp + n, xp, dp - n, n)); + ++cy; + } else + ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0)); +#else /* no mpn_rsblsh1_n*/ + if (mpn_cmp (xp, dp - n, n) > 0) { + ASSERT_NOCARRY (mpn_sub_n (xp, xp, dp - n, n)); + ++cy; + } + ASSERT_NOCARRY (mpn_sub_nc (xp + 2 * n - rn, dp - rn, xp + n - rn, rn, mpn_cmp (xp, dp - n, n - rn) > 0)); +#endif + MPN_DECR_U(ip - rn, rn, cy); /* 1 <= cy <= 4 here. */ + } else { /* "negative" residue class */ + ASSERT (xp[n] >= GMP_NUMB_MAX - CNST_LIMB(1)); + MPN_DECR_U(xp, n + 1, cy); + if (xp[n] != GMP_NUMB_MAX) { + MPN_INCR_U(ip - rn, rn, CNST_LIMB (1)); + ASSERT_CARRY (mpn_add_n (xp, xp, dp - n, n)); + } + mpn_com (xp + 2 * n - rn, xp + n - rn, rn); + } + + /* Compute x_ju_j. FIXME:We need {xp+rn,rn}, mulhi? */ + mpn_mul_n (xp, xp + 2 * n - rn, ip - rn, rn); + cy = mpn_add_n (xp + rn, xp + rn, xp + 2 * n - rn, 2 * rn - n); + cy = mpn_add_nc (ip - n, xp + 3 * rn - n, xp + n + rn, n - rn, cy); + MPN_INCR_U (ip - rn, rn, cy); + if (sizp == sizes) { /* Get out of the cycle */ + /* Check for possible carry propagation from below. */ + cy = xp[3 * rn - n - 1] > GMP_NUMB_MAX - CNST_LIMB (7); /* Be conservative. */ + /* cy = mpn_add_1 (xp + rn, xp + rn, 2*rn - n, 4); */ + break; + } + rn = n; + } + TMP_FREE; + + return cy; +#undef xp +} + +mp_limb_t +mpn_invertappr (mp_ptr ip, mp_srcptr dp, mp_size_t n, mp_ptr scratch) +{ + ASSERT (n > 0); + ASSERT (dp[n-1] & GMP_NUMB_HIGHBIT); + ASSERT (! MPN_OVERLAP_P (ip, n, dp, n)); + ASSERT (! MPN_OVERLAP_P (ip, n, scratch, mpn_invertappr_itch(n))); + ASSERT (! MPN_OVERLAP_P (dp, n, scratch, mpn_invertappr_itch(n))); + + if (BELOW_THRESHOLD (n, INV_NEWTON_THRESHOLD)) + return mpn_bc_invertappr (ip, dp, n, scratch); + else + return mpn_ni_invertappr (ip, dp, n, scratch); +} diff --git a/gmp-6.3.0/mpn/generic/jacbase.c b/gmp-6.3.0/mpn/generic/jacbase.c new file mode 100644 index 0000000..391ceac --- /dev/null +++ b/gmp-6.3.0/mpn/generic/jacbase.c @@ -0,0 +1,242 @@ +/* mpn_jacobi_base -- limb/limb Jacobi symbol with restricted arguments. + + THIS INTERFACE IS PRELIMINARY AND MIGHT DISAPPEAR OR BE SUBJECT TO + INCOMPATIBLE CHANGES IN A FUTURE RELEASE OF GMP. + +Copyright 1999-2002, 2010, 2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Use the simple loop by default. The generic count_trailing_zeros is not + very fast, and the extra trickery of method 3 has proven to be less use + than might have been though. */ +#ifndef JACOBI_BASE_METHOD +#define JACOBI_BASE_METHOD 2 +#endif + + +/* Use count_trailing_zeros. */ +#if JACOBI_BASE_METHOD == 1 +#define PROCESS_TWOS_ANY \ + { \ + mp_limb_t twos; \ + count_trailing_zeros (twos, a); \ + result_bit1 ^= JACOBI_TWOS_U_BIT1 (twos, b); \ + a >>= twos; \ + } +#define PROCESS_TWOS_EVEN PROCESS_TWOS_ANY +#endif + +/* Use a simple loop. A disadvantage of this is that there's a branch on a + 50/50 chance of a 0 or 1 low bit. */ +#if JACOBI_BASE_METHOD == 2 +#define PROCESS_TWOS_EVEN \ + { \ + int two; \ + two = JACOBI_TWO_U_BIT1 (b); \ + do \ + { \ + a >>= 1; \ + result_bit1 ^= two; \ + ASSERT (a != 0); \ + } \ + while ((a & 1) == 0); \ + } +#define PROCESS_TWOS_ANY \ + if ((a & 1) == 0) \ + PROCESS_TWOS_EVEN; +#endif + +/* Process one bit arithmetically, then a simple loop. This cuts the loop + condition down to a 25/75 chance, which should branch predict better. + The CPU will need a reasonable variable left shift. */ +#if JACOBI_BASE_METHOD == 3 +#define PROCESS_TWOS_EVEN \ + { \ + int two, mask, shift; \ + \ + two = JACOBI_TWO_U_BIT1 (b); \ + mask = (~a & 2); \ + a >>= 1; \ + \ + shift = (~a & 1); \ + a >>= shift; \ + result_bit1 ^= two ^ (two & mask); \ + \ + while ((a & 1) == 0) \ + { \ + a >>= 1; \ + result_bit1 ^= two; \ + ASSERT (a != 0); \ + } \ + } +#define PROCESS_TWOS_ANY \ + { \ + int two, mask, shift; \ + \ + two = JACOBI_TWO_U_BIT1 (b); \ + shift = (~a & 1); \ + a >>= shift; \ + \ + mask = shift << 1; \ + result_bit1 ^= (two & mask); \ + \ + while ((a & 1) == 0) \ + { \ + a >>= 1; \ + result_bit1 ^= two; \ + ASSERT (a != 0); \ + } \ + } +#endif + +#if JACOBI_BASE_METHOD < 4 +/* Calculate the value of the Jacobi symbol (a/b) of two mp_limb_t's, but + with a restricted range of inputs accepted, namely b>1, b odd. + + The initial result_bit1 is taken as a parameter for the convenience of + mpz_kronecker_ui() et al. The sign changes both here and in those + routines accumulate nicely in bit 1, see the JACOBI macros. + + The return value here is the normal +1, 0, or -1. Note that +1 and -1 + have bit 1 in the "BIT1" sense, which could be useful if the caller is + accumulating it into some extended calculation. + + Duplicating the loop body to avoid the MP_LIMB_T_SWAP(a,b) would be + possible, but a couple of tests suggest it's not a significant speedup, + and may even be a slowdown, so what's here is good enough for now. */ + +int +mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int result_bit1) +{ + ASSERT (b & 1); /* b odd */ + ASSERT (b != 1); + + if (a == 0) + return 0; + + PROCESS_TWOS_ANY; + if (a == 1) + goto done; + + if (a >= b) + goto a_gt_b; + + for (;;) + { + result_bit1 ^= JACOBI_RECIP_UU_BIT1 (a, b); + MP_LIMB_T_SWAP (a, b); + + a_gt_b: + do + { + /* working on (a/b), a,b odd, a>=b */ + ASSERT (a & 1); + ASSERT (b & 1); + ASSERT (a >= b); + + if ((a -= b) == 0) + return 0; + + PROCESS_TWOS_EVEN; + if (a == 1) + goto done; + } + while (a >= b); + } + + done: + return JACOBI_BIT1_TO_PN (result_bit1); +} +#endif + +#if JACOBI_BASE_METHOD == 4 +/* Computes (a/b) for odd b > 1 and any a. 
The initial bit is taken as a + * parameter. We have no need for the convention that the sign is in + * bit 1, internally we use bit 0. */ + +/* FIXME: Could try table-based count_trailing_zeros. */ +int +mpn_jacobi_base (mp_limb_t a, mp_limb_t b, int bit) +{ + int c; + + ASSERT (b & 1); + ASSERT (b > 1); + + if (a == 0) + /* This is the only line which depends on b > 1 */ + return 0; + + bit >>= 1; + + /* Below, we represent a and b shifted right so that the least + significant one bit is implicit. */ + + b >>= 1; + + count_trailing_zeros (c, a); + bit ^= c & (b ^ (b >> 1)); + + /* We may have c==GMP_LIMB_BITS-1, so we can't use a>>c+1. */ + a >>= c; + a >>= 1; + + do + { + mp_limb_t t = a - b; + mp_limb_t bgta = LIMB_HIGHBIT_TO_MASK (t); + + if (t == 0) + return 0; + + /* If b > a, invoke reciprocity */ + bit ^= (bgta & a & b); + + /* b <-- min (a, b) */ + b += (bgta & t); + + /* a <-- |a - b| */ + a = (t ^ bgta) - bgta; + + /* Number of trailing zeros is the same no matter if we look at + * t or a, but using t gives more parallelism. */ + count_trailing_zeros (c, t); + c ++; + /* (2/b) = -1 if b = 3 or 5 mod 8 */ + bit ^= c & (b ^ (b >> 1)); + a >>= c; + } + while (a > 0); + + return 1-2*(bit & 1); +} +#endif /* JACOBI_BASE_METHOD == 4 */ diff --git a/gmp-6.3.0/mpn/generic/jacobi.c b/gmp-6.3.0/mpn/generic/jacobi.c new file mode 100644 index 0000000..d98b126 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/jacobi.c @@ -0,0 +1,294 @@ +/* jacobi.c + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 1996, 1998, 2000-2004, 2008, 2010, 2011 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef JACOBI_DC_THRESHOLD +#define JACOBI_DC_THRESHOLD GCD_DC_THRESHOLD +#endif + +/* Schönhage's rules: + * + * Assume r0 = r1 q1 + r2, with r0 odd, and r1 = q2 r2 + r3 + * + * If r1 is odd, then + * + * (r1 | r0) = s(r1, r0) (r0 | r1) = s(r1, r0) (r2, r1) + * + * where s(x,y) = (-1)^{(x-1)(y-1)/4} = (-1)^[x = y = 3 (mod 4)]. + * + * If r1 is even, r2 must be odd. 
We have + * + * (r1 | r0) = (r1 - r0 | r0) = (-1)^(r0-1)/2 (r0 - r1 | r0) + * = (-1)^(r0-1)/2 s(r0, r0 - r1) (r0 | r0 - r1) + * = (-1)^(r0-1)/2 s(r0, r0 - r1) (r1 | r0 - r1) + * + * Now, if r1 = 0 (mod 4), then the sign factor is +1, and repeating + * q1 times gives + * + * (r1 | r0) = (r1 | r2) = (r3 | r2) + * + * On the other hand, if r1 = 2 (mod 4), the sign factor is + * (-1)^{(r0-1)/2}, and repeating q1 times gives the exponent + * + * (r0-1)/2 + (r0-r1-1)/2 + ... + (r0 - (q1-1) r1)/2 + * = q1 (r0-1)/2 + q1 (q1-1)/2 + * + * and we can summarize the even case as + * + * (r1 | r0) = t(r1, r0, q1) (r3 | r2) + * + * where t(x,y,q) = (-1)^{[x = 2 (mod 4)] (q(y-1)/2 + y(q-1)/2)} + * + * What about termination? The remainder sequence ends with (0|1) = 1 + * (or (0 | r) = 0 if r != 1). What are the possible cases? If r1 is + * odd, r2 may be zero. If r1 is even, then r2 = r0 - q1 r1 is odd and + * hence non-zero. We may have r3 = r1 - q2 r2 = 0. + * + * Examples: (11|15) = - (15|11) = - (4|11) + * (4|11) = (4| 3) = (1| 3) + * (1| 3) = (3|1) = (0|1) = 1 + * + * (2|7) = (2|1) = (0|1) = 1 + * + * Detail: (2|7) = (2-7|7) = (-1|7)(5|7) = -(7|5) = -(2|5) + * (2|5) = (2-5|5) = (-1|5)(3|5) = (5|3) = (2|3) + * (2|3) = (2-3|3) = (-1|3)(1|3) = -(3|1) = -(2|1) + * + */ + +/* In principle, the state consists of four variables: e (one bit), a, + b (two bits each), d (one bit). Collected factors are (-1)^e. a and + b are the least significant bits of the current remainders. d + (denominator) is 0 if we're currently subtracting multiplies of a + from b, and 1 if we're subtracting b from a. + + e is stored in the least significant bit, while a, b and d are + coded as only 13 distinct values in bits 1-4, according to the + following table. For rows not mentioning d, the value is either + implied, or it doesn't matter. */ + +#if WANT_ASSERT +static const struct +{ + unsigned char a; + unsigned char b; +} decode_table[13] = { + /* 0 */ { 0, 1 }, + /* 1 */ { 0, 3 }, + /* 2 */ { 1, 1 }, + /* 3 */ { 1, 3 }, + /* 4 */ { 2, 1 }, + /* 5 */ { 2, 3 }, + /* 6 */ { 3, 1 }, + /* 7 */ { 3, 3 }, /* d = 1 */ + /* 8 */ { 1, 0 }, + /* 9 */ { 1, 2 }, + /* 10 */ { 3, 0 }, + /* 11 */ { 3, 2 }, + /* 12 */ { 3, 3 }, /* d = 0 */ +}; +#define JACOBI_A(bits) (decode_table[(bits)>>1].a) +#define JACOBI_B(bits) (decode_table[(bits)>>1].b) +#endif /* WANT_ASSERT */ + +const unsigned char jacobi_table[208] = { +#include "jacobitab.h" +}; + +#define BITS_FAIL 31 + +static void +jacobi_hook (void *p, mp_srcptr gp, mp_size_t gn, + mp_srcptr qp, mp_size_t qn, int d) +{ + unsigned *bitsp = (unsigned *) p; + + if (gp) + { + ASSERT (gn > 0); + if (gn != 1 || gp[0] != 1) + { + *bitsp = BITS_FAIL; + return; + } + } + + if (qp) + { + ASSERT (qn > 0); + ASSERT (d >= 0); + *bitsp = mpn_jacobi_update (*bitsp, d, qp[0] & 3); + } +} + +#define CHOOSE_P(n) (2*(n) / 3) + +int +mpn_jacobi_n (mp_ptr ap, mp_ptr bp, mp_size_t n, unsigned bits) +{ + mp_size_t scratch; + mp_size_t matrix_scratch; + mp_ptr tp; + + TMP_DECL; + + ASSERT (n > 0); + ASSERT ( (ap[n-1] | bp[n-1]) > 0); + ASSERT ( (bp[0] | ap[0]) & 1); + + /* FIXME: Check for small sizes first, before setting up temporary + storage etc. 
*/ + scratch = MPN_GCD_SUBDIV_STEP_ITCH(n); + + if (ABOVE_THRESHOLD (n, JACOBI_DC_THRESHOLD)) + { + mp_size_t hgcd_scratch; + mp_size_t update_scratch; + mp_size_t p = CHOOSE_P (n); + mp_size_t dc_scratch; + + matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p); + hgcd_scratch = mpn_hgcd_itch (n - p); + update_scratch = p + n - 1; + + dc_scratch = matrix_scratch + MAX(hgcd_scratch, update_scratch); + if (dc_scratch > scratch) + scratch = dc_scratch; + } + + TMP_MARK; + tp = TMP_ALLOC_LIMBS(scratch); + + while (ABOVE_THRESHOLD (n, JACOBI_DC_THRESHOLD)) + { + struct hgcd_matrix M; + mp_size_t p = 2*n/3; + mp_size_t matrix_scratch = MPN_HGCD_MATRIX_INIT_ITCH (n - p); + mp_size_t nn; + mpn_hgcd_matrix_init (&M, n - p, tp); + + nn = mpn_hgcd_jacobi (ap + p, bp + p, n - p, &M, &bits, + tp + matrix_scratch); + if (nn > 0) + { + ASSERT (M.n <= (n - p - 1)/2); + ASSERT (M.n + p <= (p + n - 1) / 2); + /* Temporary storage 2 (p + M->n) <= p + n - 1. */ + n = mpn_hgcd_matrix_adjust (&M, p + nn, ap, bp, p, tp + matrix_scratch); + } + else + { + /* Temporary storage n */ + n = mpn_gcd_subdiv_step (ap, bp, n, 0, jacobi_hook, &bits, tp); + if (!n) + { + TMP_FREE; + return bits == BITS_FAIL ? 0 : mpn_jacobi_finish (bits); + } + } + } + + while (n > 2) + { + struct hgcd_matrix1 M; + mp_limb_t ah, al, bh, bl; + mp_limb_t mask; + + mask = ap[n-1] | bp[n-1]; + ASSERT (mask > 0); + + if (mask & GMP_NUMB_HIGHBIT) + { + ah = ap[n-1]; al = ap[n-2]; + bh = bp[n-1]; bl = bp[n-2]; + } + else + { + int shift; + + count_leading_zeros (shift, mask); + ah = MPN_EXTRACT_NUMB (shift, ap[n-1], ap[n-2]); + al = MPN_EXTRACT_NUMB (shift, ap[n-2], ap[n-3]); + bh = MPN_EXTRACT_NUMB (shift, bp[n-1], bp[n-2]); + bl = MPN_EXTRACT_NUMB (shift, bp[n-2], bp[n-3]); + } + + /* Try an mpn_nhgcd2 step */ + if (mpn_hgcd2_jacobi (ah, al, bh, bl, &M, &bits)) + { + n = mpn_matrix22_mul1_inverse_vector (&M, tp, ap, bp, n); + MP_PTR_SWAP (ap, tp); + } + else + { + /* mpn_hgcd2 has failed. Then either one of a or b is very + small, or the difference is very small. Perform one + subtraction followed by one division. */ + n = mpn_gcd_subdiv_step (ap, bp, n, 0, &jacobi_hook, &bits, tp); + if (!n) + { + TMP_FREE; + return bits == BITS_FAIL ? 0 : mpn_jacobi_finish (bits); + } + } + } + + if (bits >= 16) + MP_PTR_SWAP (ap, bp); + + ASSERT (bp[0] & 1); + + if (n == 1) + { + mp_limb_t al, bl; + al = ap[0]; + bl = bp[0]; + + TMP_FREE; + if (bl == 1) + return 1 - 2*(bits & 1); + else + return mpn_jacobi_base (al, bl, bits << 1); + } + + else + { + int res = mpn_jacobi_2 (ap, bp, bits & 1); + TMP_FREE; + return res; + } +} diff --git a/gmp-6.3.0/mpn/generic/jacobi_2.c b/gmp-6.3.0/mpn/generic/jacobi_2.c new file mode 100644 index 0000000..028b8a4 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/jacobi_2.c @@ -0,0 +1,351 @@ +/* jacobi_2.c + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 1996, 1998, 2000-2004, 2008, 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef JACOBI_2_METHOD +#define JACOBI_2_METHOD 2 +#endif + +/* Computes (a / b) where b is odd, and a and b are otherwise arbitrary + two-limb numbers. */ +#if JACOBI_2_METHOD == 1 +int +mpn_jacobi_2 (mp_srcptr ap, mp_srcptr bp, unsigned bit) +{ + mp_limb_t ah, al, bh, bl; + int c; + + al = ap[0]; + ah = ap[1]; + bl = bp[0]; + bh = bp[1]; + + ASSERT (bl & 1); + + bl = ((bh << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK) | (bl >> 1); + bh >>= 1; + + if ( (bh | bl) == 0) + return 1 - 2*(bit & 1); + + if ( (ah | al) == 0) + return 0; + + if (al == 0) + { + al = ah; + ah = 0; + bit ^= GMP_NUMB_BITS & (bl ^ (bl >> 1)); + } + count_trailing_zeros (c, al); + bit ^= c & (bl ^ (bl >> 1)); + + c++; + if (UNLIKELY (c == GMP_NUMB_BITS)) + { + al = ah; + ah = 0; + } + else + { + al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c); + ah >>= c; + } + while ( (ah | bh) > 0) + { + mp_limb_t th, tl; + mp_limb_t bgta; + + sub_ddmmss (th, tl, ah, al, bh, bl); + if ( (tl | th) == 0) + return 0; + + bgta = LIMB_HIGHBIT_TO_MASK (th); + + /* If b > a, invoke reciprocity */ + bit ^= (bgta & al & bl); + + /* b <-- min (a, b) */ + add_ssaaaa (bh, bl, bh, bl, th & bgta, tl & bgta); + + if ( (bh | bl) == 0) + return 1 - 2*(bit & 1); + + /* a <-- |a - b| */ + al = (bgta ^ tl) - bgta; + ah = (bgta ^ th); + + if (UNLIKELY (al == 0)) + { + /* If b > a, al == 0 implies that we have a carry to + propagate. */ + al = ah - bgta; + ah = 0; + bit ^= GMP_NUMB_BITS & (bl ^ (bl >> 1)); + } + count_trailing_zeros (c, al); + c++; + bit ^= c & (bl ^ (bl >> 1)); + + if (UNLIKELY (c == GMP_NUMB_BITS)) + { + al = ah; + ah = 0; + } + else + { + al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c); + ah >>= c; + } + } + + ASSERT (bl > 0); + + while ( (al | bl) & GMP_LIMB_HIGHBIT) + { + /* Need an extra comparison to get the mask. */ + mp_limb_t t = al - bl; + mp_limb_t bgta = - (bl > al); + + if (t == 0) + return 0; + + /* If b > a, invoke reciprocity */ + bit ^= (bgta & al & bl); + + /* b <-- min (a, b) */ + bl += (bgta & t); + + /* a <-- |a - b| */ + al = (t ^ bgta) - bgta; + + /* Number of trailing zeros is the same no matter if we look at + * t or a, but using t gives more parallelism. */ + count_trailing_zeros (c, t); + c ++; + /* (2/b) = -1 if b = 3 or 5 mod 8 */ + bit ^= c & (bl ^ (bl >> 1)); + + if (UNLIKELY (c == GMP_NUMB_BITS)) + return 1 - 2*(bit & 1); + + al >>= c; + } + + /* Here we have a little impedance mismatch. Better to inline it? */ + return mpn_jacobi_base (2*al+1, 2*bl+1, bit << 1); +} +#elif JACOBI_2_METHOD == 2 +int +mpn_jacobi_2 (mp_srcptr ap, mp_srcptr bp, unsigned bit) +{ + mp_limb_t ah, al, bh, bl; + int c; + + al = ap[0]; + ah = ap[1]; + bl = bp[0]; + bh = bp[1]; + + ASSERT (bl & 1); + + /* Use bit 1. 
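+     (Illustrative note: keeping the accumulated sign in bit 1 matches the
+     BIT1 convention expected by mpn_jacobi_base, so a final +-1 result can
+     be formed directly as 1 - (bit & 2).)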
*/ + bit <<= 1; + + if (bh == 0 && bl == 1) + /* (a|1) = 1 */ + return 1 - (bit & 2); + + if (al == 0) + { + if (ah == 0) + /* (0|b) = 0, b > 1 */ + return 0; + + count_trailing_zeros (c, ah); + bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1)); + + al = bl; + bl = ah >> c; + + if (bl == 1) + /* (1|b) = 1 */ + return 1 - (bit & 2); + + ah = bh; + + bit ^= al & bl; + + goto b_reduced; + } + if ( (al & 1) == 0) + { + count_trailing_zeros (c, al); + + al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c); + ah >>= c; + bit ^= (c << 1) & (bl ^ (bl >> 1)); + } + if (ah == 0) + { + if (bh > 0) + { + bit ^= al & bl; + MP_LIMB_T_SWAP (al, bl); + ah = bh; + goto b_reduced; + } + goto ab_reduced; + } + + while (bh > 0) + { + /* Compute (a|b) */ + while (ah > bh) + { + sub_ddmmss (ah, al, ah, al, bh, bl); + if (al == 0) + { + count_trailing_zeros (c, ah); + bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1)); + + al = bl; + bl = ah >> c; + ah = bh; + + bit ^= al & bl; + goto b_reduced; + } + count_trailing_zeros (c, al); + bit ^= (c << 1) & (bl ^ (bl >> 1)); + al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c); + ah >>= c; + } + if (ah == bh) + goto cancel_hi; + + if (ah == 0) + { + bit ^= al & bl; + MP_LIMB_T_SWAP (al, bl); + ah = bh; + break; + } + + bit ^= al & bl; + + /* Compute (b|a) */ + while (bh > ah) + { + sub_ddmmss (bh, bl, bh, bl, ah, al); + if (bl == 0) + { + count_trailing_zeros (c, bh); + bit ^= ((GMP_NUMB_BITS + c) << 1) & (al ^ (al >> 1)); + + bl = bh >> c; + bit ^= al & bl; + goto b_reduced; + } + count_trailing_zeros (c, bl); + bit ^= (c << 1) & (al ^ (al >> 1)); + bl = ((bh << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (bl >> c); + bh >>= c; + } + bit ^= al & bl; + + /* Compute (a|b) */ + if (ah == bh) + { + cancel_hi: + if (al < bl) + { + MP_LIMB_T_SWAP (al, bl); + bit ^= al & bl; + } + al -= bl; + if (al == 0) + return 0; + + count_trailing_zeros (c, al); + bit ^= (c << 1) & (bl ^ (bl >> 1)); + al >>= c; + + if (al == 1) + return 1 - (bit & 2); + + MP_LIMB_T_SWAP (al, bl); + bit ^= al & bl; + break; + } + } + + b_reduced: + /* Compute (a|b), with b a single limb. */ + ASSERT (bl & 1); + + if (bl == 1) + /* (a|1) = 1 */ + return 1 - (bit & 2); + + while (ah > 0) + { + ah -= (al < bl); + al -= bl; + if (al == 0) + { + if (ah == 0) + return 0; + count_trailing_zeros (c, ah); + bit ^= ((GMP_NUMB_BITS + c) << 1) & (bl ^ (bl >> 1)); + al = ah >> c; + goto ab_reduced; + } + count_trailing_zeros (c, al); + + al = ((ah << (GMP_NUMB_BITS - c)) & GMP_NUMB_MASK) | (al >> c); + ah >>= c; + bit ^= (c << 1) & (bl ^ (bl >> 1)); + } + ab_reduced: + ASSERT (bl & 1); + ASSERT (bl > 1); + + return mpn_jacobi_base (al, bl, bit); +} +#else +#error Unsupported value for JACOBI_2_METHOD +#endif diff --git a/gmp-6.3.0/mpn/generic/logops_n.c b/gmp-6.3.0/mpn/generic/logops_n.c new file mode 100644 index 0000000..3adba2c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/logops_n.c @@ -0,0 +1,77 @@ +/* mpn_and_n, mpn_ior_n, etc -- mpn logical operations. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. 
+ +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#ifdef OPERATION_and_n +#define func __MPN(and_n) +#define call mpn_and_n +#endif + +#ifdef OPERATION_andn_n +#define func __MPN(andn_n) +#define call mpn_andn_n +#endif + +#ifdef OPERATION_nand_n +#define func __MPN(nand_n) +#define call mpn_nand_n +#endif + +#ifdef OPERATION_ior_n +#define func __MPN(ior_n) +#define call mpn_ior_n +#endif + +#ifdef OPERATION_iorn_n +#define func __MPN(iorn_n) +#define call mpn_iorn_n +#endif + +#ifdef OPERATION_nior_n +#define func __MPN(nior_n) +#define call mpn_nior_n +#endif + +#ifdef OPERATION_xor_n +#define func __MPN(xor_n) +#define call mpn_xor_n +#endif + +#ifdef OPERATION_xnor_n +#define func __MPN(xnor_n) +#define call mpn_xnor_n +#endif + +void +func (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + call (rp, up, vp, n); +} diff --git a/gmp-6.3.0/mpn/generic/lshift.c b/gmp-6.3.0/mpn/generic/lshift.c new file mode 100644 index 0000000..7e1fdef --- /dev/null +++ b/gmp-6.3.0/mpn/generic/lshift.c @@ -0,0 +1,72 @@ +/* mpn_lshift -- Shift left low level. + +Copyright 1991, 1993, 1994, 1996, 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* Shift U (pointed to by up and n limbs long) cnt bits to the left + and store the n least significant limbs of the result at rp. + Return the bits shifted out from the most significant limb. + + Argument constraints: + 1. 0 < cnt < GMP_NUMB_BITS. + 2. If the result is to be written over the input, rp must be >= up. 
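+
+   Example (illustrative, assuming GMP_NUMB_BITS == 64 and no nail bits):
+   with n == 1, cnt == 1 and up[0] == 0x8000000000000001, the limb stored at
+   rp[0] is 0x2 and the return value is 1 (the bit shifted out at the top).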
+*/ + +mp_limb_t +mpn_lshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt) +{ + mp_limb_t high_limb, low_limb; + unsigned int tnc; + mp_size_t i; + mp_limb_t retval; + + ASSERT (n >= 1); + ASSERT (cnt >= 1); + ASSERT (cnt < GMP_NUMB_BITS); + ASSERT (MPN_SAME_OR_DECR_P (rp, up, n)); + + up += n; + rp += n; + + tnc = GMP_NUMB_BITS - cnt; + low_limb = *--up; + retval = low_limb >> tnc; + high_limb = (low_limb << cnt) & GMP_NUMB_MASK; + + for (i = n - 1; i != 0; i--) + { + low_limb = *--up; + *--rp = high_limb | (low_limb >> tnc); + high_limb = (low_limb << cnt) & GMP_NUMB_MASK; + } + *--rp = high_limb; + + return retval; +} diff --git a/gmp-6.3.0/mpn/generic/lshiftc.c b/gmp-6.3.0/mpn/generic/lshiftc.c new file mode 100644 index 0000000..a583602 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/lshiftc.c @@ -0,0 +1,73 @@ +/* mpn_lshiftc -- Shift left low level with complement. + +Copyright 1991, 1993, 1994, 1996, 2000-2002, 2009 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* Shift U (pointed to by up and n limbs long) cnt bits to the left + and store the n least significant limbs of the result at rp. + Return the bits shifted out from the most significant limb. + + Argument constraints: + 1. 0 < cnt < GMP_NUMB_BITS. + 2. If the result is to be written over the input, rp must be >= up. +*/ + +mp_limb_t +mpn_lshiftc (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt) +{ + mp_limb_t high_limb, low_limb; + unsigned int tnc; + mp_size_t i; + mp_limb_t retval; + + ASSERT (n >= 1); + ASSERT (cnt >= 1); + ASSERT (cnt < GMP_NUMB_BITS); + ASSERT (MPN_SAME_OR_DECR_P (rp, up, n)); + + up += n; + rp += n; + + tnc = GMP_NUMB_BITS - cnt; + low_limb = *--up; + retval = low_limb >> tnc; + high_limb = (low_limb << cnt); + + for (i = n - 1; i != 0; i--) + { + low_limb = *--up; + *--rp = (~(high_limb | (low_limb >> tnc))) & GMP_NUMB_MASK; + high_limb = low_limb << cnt; + } + *--rp = (~high_limb) & GMP_NUMB_MASK; + + return retval; +} diff --git a/gmp-6.3.0/mpn/generic/matrix22_mul.c b/gmp-6.3.0/mpn/generic/matrix22_mul.c new file mode 100644 index 0000000..6a1299a --- /dev/null +++ b/gmp-6.3.0/mpn/generic/matrix22_mul.c @@ -0,0 +1,321 @@ +/* matrix22_mul.c. + + Contributed by Niels Möller and Marco Bodrato. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2003-2005, 2008, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#define MUL(rp, ap, an, bp, bn) do { \ + if (an >= bn) \ + mpn_mul (rp, ap, an, bp, bn); \ + else \ + mpn_mul (rp, bp, bn, ap, an); \ +} while (0) + +/* Inputs are unsigned. */ +static int +abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n) +{ + int c; + MPN_CMP (c, ap, bp, n); + if (c >= 0) + { + mpn_sub_n (rp, ap, bp, n); + return 0; + } + else + { + mpn_sub_n (rp, bp, ap, n); + return 1; + } +} + +static int +add_signed_n (mp_ptr rp, + mp_srcptr ap, int as, mp_srcptr bp, int bs, mp_size_t n) +{ + if (as != bs) + return as ^ abs_sub_n (rp, ap, bp, n); + else + { + ASSERT_NOCARRY (mpn_add_n (rp, ap, bp, n)); + return as; + } +} + +mp_size_t +mpn_matrix22_mul_itch (mp_size_t rn, mp_size_t mn) +{ + if (BELOW_THRESHOLD (rn, MATRIX22_STRASSEN_THRESHOLD) + || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD)) + return 3*rn + 2*mn; + else + return 3*(rn + mn) + 5; +} + +/* Algorithm: + + / s0 \ / 1 0 0 0 \ / r0 \ + | s1 | | 0 1 0 1 | | r1 | + | s2 | | 0 0 -1 1 | | r2 | + | s3 | = | 0 1 -1 1 | \ r3 / + | s4 | | -1 1 -1 1 | + | s5 | | 0 1 0 0 | + \ s6 / \ 0 0 1 0 / + + / t0 \ / 1 0 0 0 \ / m0 \ + | t1 | | 0 1 0 1 | | m1 | + | t2 | | 0 0 -1 1 | | m2 | + | t3 | = | 0 1 -1 1 | \ m3 / + | t4 | | -1 1 -1 1 | + | t5 | | 0 1 0 0 | + \ t6 / \ 0 0 1 0 / + + Note: the two matrices above are the same, but s_i and t_i are used + in the same product, only for i<4, see "A Strassen-like Matrix + Multiplication suited for squaring and higher power computation" by + M. Bodrato, in Proceedings of ISSAC 2010. + + / r0 \ / 1 0 0 0 0 1 0 \ / s0*t0 \ + | r1 | = | 0 0 -1 1 -1 1 0 | | s1*t1 | + | r2 | | 0 1 0 -1 0 -1 -1 | | s2*t2 | + \ r3 / \ 0 1 1 -1 0 -1 0 / | s3*t3 | + | s4*t5 | + | s5*t6 | + \ s6*t4 / + + The scheduling uses two temporaries U0 and U1 to store products, and + two, S0 and T0, to store combinations of entries of the two + operands. +*/ + +/* Computes R = R * M. Elements are numbers R = (r0, r1; r2, r3). + * + * Resulting elements are of size up to rn + mn + 1. + * + * Temporary storage: 3 rn + 3 mn + 5. 
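+ *
+ * For reference, elementwise with R = (r0, r1; r2, r3) and M = (m0, m1;
+ * m2, m3) the product computed is
+ *   r0' = r0 m0 + r1 m2, r1' = r0 m1 + r1 m3,
+ *   r2' = r2 m0 + r3 m2, r3' = r2 m1 + r3 m3,
+ * using the seven products s_i t_j listed above in place of the eight plain
+ * multiplications of the basecase branch in mpn_matrix22_mul.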
*/ +static void +mpn_matrix22_mul_strassen (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn, + mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn, + mp_ptr tp) +{ + mp_ptr s0, t0, u0, u1; + int r1s, r3s, s0s, t0s, u1s; + s0 = tp; tp += rn + 1; + t0 = tp; tp += mn + 1; + u0 = tp; tp += rn + mn + 1; + u1 = tp; /* rn + mn + 2 */ + + MUL (u0, r1, rn, m2, mn); /* u5 = s5 * t6 */ + r3s = abs_sub_n (r3, r3, r2, rn); /* r3 - r2 */ + if (r3s) + { + r1s = abs_sub_n (r1, r1, r3, rn); + r1[rn] = 0; + } + else + { + r1[rn] = mpn_add_n (r1, r1, r3, rn); + r1s = 0; /* r1 - r2 + r3 */ + } + if (r1s) + { + s0[rn] = mpn_add_n (s0, r1, r0, rn); + s0s = 0; + } + else if (r1[rn] != 0) + { + s0[rn] = r1[rn] - mpn_sub_n (s0, r1, r0, rn); + s0s = 1; /* s4 = -r0 + r1 - r2 + r3 */ + /* Reverse sign! */ + } + else + { + s0s = abs_sub_n (s0, r0, r1, rn); + s0[rn] = 0; + } + MUL (u1, r0, rn, m0, mn); /* u0 = s0 * t0 */ + r0[rn+mn] = mpn_add_n (r0, u0, u1, rn + mn); + ASSERT (r0[rn+mn] < 2); /* u0 + u5 */ + + t0s = abs_sub_n (t0, m3, m2, mn); + u1s = r3s^t0s^1; /* Reverse sign! */ + MUL (u1, r3, rn, t0, mn); /* u2 = s2 * t2 */ + u1[rn+mn] = 0; + if (t0s) + { + t0s = abs_sub_n (t0, m1, t0, mn); + t0[mn] = 0; + } + else + { + t0[mn] = mpn_add_n (t0, t0, m1, mn); + } + + /* FIXME: Could be simplified if we had space for rn + mn + 2 limbs + at r3. I'd expect that for matrices of random size, the high + words t0[mn] and r1[rn] are non-zero with a pretty small + probability. If that can be confirmed this should be done as an + unconditional rn x (mn+1) followed by an if (UNLIKELY (r1[rn])) + add_n. */ + if (t0[mn] != 0) + { + MUL (r3, r1, rn, t0, mn + 1); /* u3 = s3 * t3 */ + ASSERT (r1[rn] < 2); + if (r1[rn] != 0) + mpn_add_n (r3 + rn, r3 + rn, t0, mn + 1); + } + else + { + MUL (r3, r1, rn + 1, t0, mn); + } + + ASSERT (r3[rn+mn] < 4); + + u0[rn+mn] = 0; + if (r1s^t0s) + { + r3s = abs_sub_n (r3, u0, r3, rn + mn + 1); + } + else + { + ASSERT_NOCARRY (mpn_add_n (r3, r3, u0, rn + mn + 1)); + r3s = 0; /* u3 + u5 */ + } + + if (t0s) + { + t0[mn] = mpn_add_n (t0, t0, m0, mn); + } + else if (t0[mn] != 0) + { + t0[mn] -= mpn_sub_n (t0, t0, m0, mn); + } + else + { + t0s = abs_sub_n (t0, t0, m0, mn); + } + MUL (u0, r2, rn, t0, mn + 1); /* u6 = s6 * t4 */ + ASSERT (u0[rn+mn] < 2); + if (r1s) + { + ASSERT_NOCARRY (mpn_sub_n (r1, r2, r1, rn)); + } + else + { + r1[rn] += mpn_add_n (r1, r1, r2, rn); + } + rn++; + t0s = add_signed_n (r2, r3, r3s, u0, t0s, rn + mn); + /* u3 + u5 + u6 */ + ASSERT (r2[rn+mn-1] < 4); + r3s = add_signed_n (r3, r3, r3s, u1, u1s, rn + mn); + /* -u2 + u3 + u5 */ + ASSERT (r3[rn+mn-1] < 3); + MUL (u0, s0, rn, m1, mn); /* u4 = s4 * t5 */ + ASSERT (u0[rn+mn-1] < 2); + t0[mn] = mpn_add_n (t0, m3, m1, mn); + MUL (u1, r1, rn, t0, mn + 1); /* u1 = s1 * t1 */ + mn += rn; + ASSERT (u1[mn-1] < 4); + ASSERT (u1[mn] == 0); + ASSERT_NOCARRY (add_signed_n (r1, r3, r3s, u0, s0s, mn)); + /* -u2 + u3 - u4 + u5 */ + ASSERT (r1[mn-1] < 2); + if (r3s) + { + ASSERT_NOCARRY (mpn_add_n (r3, u1, r3, mn)); + } + else + { + ASSERT_NOCARRY (mpn_sub_n (r3, u1, r3, mn)); + /* u1 + u2 - u3 - u5 */ + } + ASSERT (r3[mn-1] < 2); + if (t0s) + { + ASSERT_NOCARRY (mpn_add_n (r2, u1, r2, mn)); + } + else + { + ASSERT_NOCARRY (mpn_sub_n (r2, u1, r2, mn)); + /* u1 - u3 - u5 - u6 */ + } + ASSERT (r2[mn-1] < 2); +} + +void +mpn_matrix22_mul (mp_ptr r0, mp_ptr r1, mp_ptr r2, mp_ptr r3, mp_size_t rn, + mp_srcptr m0, mp_srcptr m1, mp_srcptr m2, mp_srcptr m3, mp_size_t mn, + mp_ptr tp) +{ + if (BELOW_THRESHOLD (rn, 
MATRIX22_STRASSEN_THRESHOLD) + || BELOW_THRESHOLD (mn, MATRIX22_STRASSEN_THRESHOLD)) + { + mp_ptr p0, p1; + unsigned i; + + /* Temporary storage: 3 rn + 2 mn */ + p0 = tp + rn; + p1 = p0 + rn + mn; + + for (i = 0; i < 2; i++) + { + MPN_COPY (tp, r0, rn); + + if (rn >= mn) + { + mpn_mul (p0, r0, rn, m0, mn); + mpn_mul (p1, r1, rn, m3, mn); + mpn_mul (r0, r1, rn, m2, mn); + mpn_mul (r1, tp, rn, m1, mn); + } + else + { + mpn_mul (p0, m0, mn, r0, rn); + mpn_mul (p1, m3, mn, r1, rn); + mpn_mul (r0, m2, mn, r1, rn); + mpn_mul (r1, m1, mn, tp, rn); + } + r0[rn+mn] = mpn_add_n (r0, r0, p0, rn + mn); + r1[rn+mn] = mpn_add_n (r1, r1, p1, rn + mn); + + r0 = r2; r1 = r3; + } + } + else + mpn_matrix22_mul_strassen (r0, r1, r2, r3, rn, + m0, m1, m2, m3, mn, tp); +} diff --git a/gmp-6.3.0/mpn/generic/matrix22_mul1_inverse_vector.c b/gmp-6.3.0/mpn/generic/matrix22_mul1_inverse_vector.c new file mode 100644 index 0000000..68d50b7 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/matrix22_mul1_inverse_vector.c @@ -0,0 +1,64 @@ +/* matrix22_mul1_inverse_vector.c + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2008, 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Sets (r;b) = M^{-1}(a;b), with M^{-1} = (u11, -u01; -u10, u00) from + the left. Uses three buffers, to avoid a copy. */ +mp_size_t +mpn_matrix22_mul1_inverse_vector (const struct hgcd_matrix1 *M, + mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n) +{ + mp_limb_t h0, h1; + + /* Compute (r;b) <-- (u11 a - u01 b; -u10 a + u00 b) as + + r = u11 * a + r -= u01 * b + b *= u00 + b -= u10 * a + */ + + h0 = mpn_mul_1 (rp, ap, n, M->u[1][1]); + h1 = mpn_submul_1 (rp, bp, n, M->u[0][1]); + ASSERT (h0 == h1); + + h0 = mpn_mul_1 (bp, bp, n, M->u[0][0]); + h1 = mpn_submul_1 (bp, ap, n, M->u[1][0]); + ASSERT (h0 == h1); + + n -= (rp[n-1] | bp[n-1]) == 0; + return n; +} diff --git a/gmp-6.3.0/mpn/generic/mod_1.c b/gmp-6.3.0/mpn/generic/mod_1.c new file mode 100644 index 0000000..f737bc2 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mod_1.c @@ -0,0 +1,278 @@ +/* mpn_mod_1(dividend_ptr, dividend_size, divisor_limb) -- + Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. + Return the single-limb remainder. + There are no constraints on the value of the divisor. + +Copyright 1991, 1993, 1994, 1999, 2000, 2002, 2007-2009, 2012, 2020 +Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* The size where udiv_qrnnd_preinv should be used rather than udiv_qrnnd, + meaning the quotient size where that should happen, the quotient size + being how many udiv divisions will be done. + + The default is to use preinv always, CPUs where this doesn't suit have + tuned thresholds. Note in particular that preinv should certainly be + used if that's the only division available (USE_PREINV_ALWAYS). */ + +#ifndef MOD_1_NORM_THRESHOLD +#define MOD_1_NORM_THRESHOLD 0 +#endif + +#ifndef MOD_1_UNNORM_THRESHOLD +#define MOD_1_UNNORM_THRESHOLD 0 +#endif + +#ifndef MOD_1U_TO_MOD_1_1_THRESHOLD +#define MOD_1U_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* default is not to use mpn_mod_1s */ +#endif + +#ifndef MOD_1N_TO_MOD_1_1_THRESHOLD +#define MOD_1N_TO_MOD_1_1_THRESHOLD MP_SIZE_T_MAX /* default is not to use mpn_mod_1s */ +#endif + +#ifndef MOD_1_1_TO_MOD_1_2_THRESHOLD +#define MOD_1_1_TO_MOD_1_2_THRESHOLD 10 +#endif + +#ifndef MOD_1_2_TO_MOD_1_4_THRESHOLD +#define MOD_1_2_TO_MOD_1_4_THRESHOLD 20 +#endif + +#if TUNE_PROGRAM_BUILD && !HAVE_NATIVE_mpn_mod_1_1p +/* Duplicates declarations in tune/speed.h */ +mp_limb_t mpn_mod_1_1p_1 (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t [4]); +mp_limb_t mpn_mod_1_1p_2 (mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t [4]); + +void mpn_mod_1_1p_cps_1 (mp_limb_t [4], mp_limb_t); +void mpn_mod_1_1p_cps_2 (mp_limb_t [4], mp_limb_t); + +#undef mpn_mod_1_1p +#define mpn_mod_1_1p(ap, n, b, pre) \ + (mod_1_1p_method == 1 ? mpn_mod_1_1p_1 (ap, n, b, pre) \ + : (mod_1_1p_method == 2 ? mpn_mod_1_1p_2 (ap, n, b, pre) \ + : __gmpn_mod_1_1p (ap, n, b, pre))) + +#undef mpn_mod_1_1p_cps +#define mpn_mod_1_1p_cps(pre, b) \ + (mod_1_1p_method == 1 ? mpn_mod_1_1p_cps_1 (pre, b) \ + : (mod_1_1p_method == 2 ? mpn_mod_1_1p_cps_2 (pre, b) \ + : __gmpn_mod_1_1p_cps (pre, b))) +#endif /* TUNE_PROGRAM_BUILD && !HAVE_NATIVE_mpn_mod_1_1p */ + + +/* The comments in mpn/generic/divrem_1.c apply here too. + + As noted in the algorithms section of the manual, the shifts in the loop + for the unnorm case can be avoided by calculating r = a%(d*2^n), followed + by a final (r*2^n)%(d*2^n). In fact if it happens that a%(d*2^n) can + skip a division where (a*2^n)%(d*2^n) can't then there's the same number + of divide steps, though how often that happens depends on the assumed + distributions of dividend and divisor. In any case this idea is left to + CPU specific implementations to consider. 
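+
+   (Spelling the identity out, as a sketch only: with r = a % (d*2^n), r is
+   congruent to a mod d, and (r*2^n) % (d*2^n) = (r % d)*2^n, so the final
+   step recovers a % d after an n-bit right shift.)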
*/ + +static mp_limb_t +mpn_mod_1_unnorm (mp_srcptr up, mp_size_t un, mp_limb_t d) +{ + mp_size_t i; + mp_limb_t n1, n0, r; + mp_limb_t dummy; + int cnt; + + ASSERT (un > 0); + ASSERT (d != 0); + + /* Skip a division if high < divisor. Having the test here before + normalizing will still skip as often as possible. */ + r = up[un - 1]; + if (r < d) + { + if (--un == 0) + return r; + } + else + r = 0; + + d <<= GMP_NAIL_BITS; + + /* If udiv_qrnnd doesn't need a normalized divisor, can use the simple + code above. */ + if (! UDIV_NEEDS_NORMALIZATION + && BELOW_THRESHOLD (un, MOD_1_UNNORM_THRESHOLD)) + { + for (i = un - 1; i >= 0; i--) + { + n0 = up[i] << GMP_NAIL_BITS; + udiv_qrnnd (dummy, r, r, n0, d); + r >>= GMP_NAIL_BITS; + } + return r; + } + + count_leading_zeros (cnt, d); + d <<= cnt; + + n1 = up[un - 1] << GMP_NAIL_BITS; + r = (r << cnt) | (n1 >> (GMP_LIMB_BITS - cnt)); + + if (UDIV_NEEDS_NORMALIZATION + && BELOW_THRESHOLD (un, MOD_1_UNNORM_THRESHOLD)) + { + mp_limb_t nshift; + for (i = un - 2; i >= 0; i--) + { + n0 = up[i] << GMP_NAIL_BITS; + nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt)); + udiv_qrnnd (dummy, r, r, nshift, d); + r >>= GMP_NAIL_BITS; + n1 = n0; + } + udiv_qrnnd (dummy, r, r, n1 << cnt, d); + r >>= GMP_NAIL_BITS; + return r >> cnt; + } + else + { + mp_limb_t inv, nshift; + invert_limb (inv, d); + + for (i = un - 2; i >= 0; i--) + { + n0 = up[i] << GMP_NAIL_BITS; + nshift = (n1 << cnt) | (n0 >> (GMP_NUMB_BITS - cnt)); + udiv_rnnd_preinv (r, r, nshift, d, inv); + r >>= GMP_NAIL_BITS; + n1 = n0; + } + udiv_rnnd_preinv (r, r, n1 << cnt, d, inv); + r >>= GMP_NAIL_BITS; + return r >> cnt; + } +} + +static mp_limb_t +mpn_mod_1_norm (mp_srcptr up, mp_size_t un, mp_limb_t d) +{ + mp_size_t i; + mp_limb_t n0, r; + mp_limb_t dummy; + + ASSERT (un > 0); + + d <<= GMP_NAIL_BITS; + + ASSERT (d & GMP_LIMB_HIGHBIT); + + /* High limb is initial remainder, possibly with one subtract of + d to get r= d) + r -= d; + r >>= GMP_NAIL_BITS; + un--; + if (un == 0) + return r; + + if (BELOW_THRESHOLD (un, MOD_1_NORM_THRESHOLD)) + { + for (i = un - 1; i >= 0; i--) + { + n0 = up[i] << GMP_NAIL_BITS; + udiv_qrnnd (dummy, r, r, n0, d); + r >>= GMP_NAIL_BITS; + } + return r; + } + else + { + mp_limb_t inv; + invert_limb (inv, d); + for (i = un - 1; i >= 0; i--) + { + n0 = up[i] << GMP_NAIL_BITS; + udiv_rnnd_preinv (r, r, n0, d, inv); + r >>= GMP_NAIL_BITS; + } + return r; + } +} + +mp_limb_t +mpn_mod_1 (mp_srcptr ap, mp_size_t n, mp_limb_t b) +{ + ASSERT (n >= 0); + ASSERT (b != 0); + + /* Should this be handled at all? Rely on callers? Note un==0 is currently + required by mpz/fdiv_r_ui.c and possibly other places. 
*/ + if (n == 0) + return 0; + + if (UNLIKELY ((b & GMP_NUMB_HIGHBIT) != 0)) + { + if (BELOW_THRESHOLD (n, MOD_1N_TO_MOD_1_1_THRESHOLD)) + { + return mpn_mod_1_norm (ap, n, b); + } + else + { + mp_limb_t pre[4]; + mpn_mod_1_1p_cps (pre, b); + return mpn_mod_1_1p (ap, n, b, pre); + } + } + else + { + if (BELOW_THRESHOLD (n, MOD_1U_TO_MOD_1_1_THRESHOLD)) + { + return mpn_mod_1_unnorm (ap, n, b); + } + else if (BELOW_THRESHOLD (n, MOD_1_1_TO_MOD_1_2_THRESHOLD)) + { + mp_limb_t pre[4]; + mpn_mod_1_1p_cps (pre, b); + return mpn_mod_1_1p (ap, n, b << pre[1], pre); + } + else if (BELOW_THRESHOLD (n, MOD_1_2_TO_MOD_1_4_THRESHOLD) || UNLIKELY (b > GMP_NUMB_MASK / 4)) + { + mp_limb_t pre[5]; + mpn_mod_1s_2p_cps (pre, b); + return mpn_mod_1s_2p (ap, n, b << pre[1], pre); + } + else + { + mp_limb_t pre[7]; + mpn_mod_1s_4p_cps (pre, b); + return mpn_mod_1s_4p (ap, n, b << pre[1], pre); + } + } +} diff --git a/gmp-6.3.0/mpn/generic/mod_1_1.c b/gmp-6.3.0/mpn/generic/mod_1_1.c new file mode 100644 index 0000000..be199ff --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mod_1_1.c @@ -0,0 +1,341 @@ +/* mpn_mod_1_1p (ap, n, b, cps) + Divide (ap,,n) by b. Return the single-limb remainder. + + Contributed to the GNU project by Torbjorn Granlund and Niels Möller. + Based on a suggestion by Peter L. Montgomery. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2008-2011, 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef MOD_1_1P_METHOD +# define MOD_1_1P_METHOD 1 /* need to make sure this is 2 for asm testing */ +#endif + +/* Define some longlong.h-style macros, but for wider operations. + * add_mssaaaa is like longlong.h's add_ssaaaa, but also generates + * carry out, in the form of a mask. */ + +#if defined (__GNUC__) && ! 
defined (NO_ASM) + +#if HAVE_HOST_CPU_FAMILY_x86 && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add %6, %k2\n\t" \ + "adc %4, %k1\n\t" \ + "sbb %k0, %k0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((USItype)(a1)), "g" ((USItype)(b1)), \ + "%2" ((USItype)(a0)), "g" ((USItype)(b0))) +#endif + +#if HAVE_HOST_CPU_FAMILY_x86_64 && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add %6, %q2\n\t" \ + "adc %4, %q1\n\t" \ + "sbb %q0, %q0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((UDItype)(a1)), "rme" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "rme" ((UDItype)(b0))) +#endif + +#if defined (__sparc__) && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addxcc %r3, %4, %1\n\t" \ + "subx %%g0, %%g0, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \ + __CLOBBER_CC) +#endif + +#if defined (__sparc__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addccc %r7, %8, %%g0\n\t" \ + "addccc %r3, %4, %1\n\t" \ + "clr %0\n\t" \ + "movcs %%xcc, -1, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \ + "rJ" ((al) >> 32), "rI" ((bl) >> 32) \ + __CLOBBER_CC) +#if __VIS__ >= 0x300 +#undef add_mssaaaa +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "addcc %r5, %6, %2\n\t" \ + "addxccc %r3, %4, %1\n\t" \ + "clr %0\n\t" \ + "movcs %%xcc, -1, %0" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl) \ + __CLOBBER_CC) +#endif +#endif + +#if HAVE_HOST_CPU_FAMILY_powerpc && !defined (_LONG_LONG_LIMB) +/* This works fine for 32-bit and 64-bit limbs, except for 64-bit limbs with a + processor running in 32-bit mode, since the carry flag then gets the 32-bit + carry. 
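Every inline-asm definition above implements the same contract, restated portably here and matching the generic fallback that appears further down: add the two-limb numbers (a1,a0) and (b1,b0), and deliver the carry out of the high limb as a mask m that is 0 on no carry and all ones on carry, so callers can apply it with a plain AND instead of a branch. Editor's sketch, assuming 64-bit limbs.

#include <stdint.h>

/* Editor's sketch of the add_mssaaaa contract (64-bit limbs assumed):
   (m, s1, s0) <- (a1, a0) + (b1, b0), with m = 0 or ~0 as the carry mask.  */
static void
add_mssaaaa_ref (uint64_t *m, uint64_t *s1, uint64_t *s0,
                 uint64_t a1, uint64_t a0, uint64_t b1, uint64_t b0)
{
  uint64_t lo = a0 + b0;
  uint64_t c0 = lo < a0;          /* carry out of the low limb */
  uint64_t hi = a1 + b1;
  uint64_t c1 = hi < a1;          /* carry out of the high limb add */
  hi += c0;
  c1 += hi < c0;                  /* carry from propagating c0 */
  *s0 = lo;
  *s1 = hi;
  *m = -c1;                       /* 0 if no carry, ~0 (all ones) if carry */
}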
*/ +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "add%I6c %2, %5, %6\n\t" \ + "adde %1, %3, %4\n\t" \ + "subfe %0, %0, %0\n\t" \ + "nor %0, %0, %0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "r" (a1), "r" (b1), "%r" (a0), "rI" (b0) \ + __CLOBBER_CC) +#endif + +#if defined (__s390x__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + __asm__ ( "algr %2, %6\n\t" \ + "alcgr %1, %4\n\t" \ + "lghi %0, 0\n\t" \ + "alcgr %0, %0\n\t" \ + "lcgr %0, %0" \ + : "=r" (m), "=r" (s1), "=&r" (s0) \ + : "1" ((UDItype)(a1)), "r" ((UDItype)(b1)), \ + "%2" ((UDItype)(a0)), "r" ((UDItype)(b0)) __CLOBBER_CC) +#endif + +#if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "adds %2, %5, %6\n\t" \ + "adcs %1, %3, %4\n\t" \ + "movcc %0, #0\n\t" \ + "movcs %0, #-1" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC) +#endif + +#if defined (__aarch64__) && W_TYPE_SIZE == 64 +#define add_mssaaaa(m, sh, sl, ah, al, bh, bl) \ + __asm__ ( "adds %2, %x5, %6\n\t" \ + "adcs %1, %x3, %x4\n\t" \ + "csinv %0, xzr, xzr, cc\n\t" \ + : "=r" (m), "=r" (sh), "=&r" (sl) \ + : "rZ" (ah), "rZ" (bh), "%rZ" (al), "rI" (bl) __CLOBBER_CC) +#endif +#endif /* defined (__GNUC__) */ + +#ifndef add_mssaaaa +#define add_mssaaaa(m, s1, s0, a1, a0, b1, b0) \ + do { \ + UWtype __s0, __s1, __c0, __c1; \ + __s0 = (a0) + (b0); \ + __s1 = (a1) + (b1); \ + __c0 = __s0 < (a0); \ + __c1 = __s1 < (a1); \ + (s0) = __s0; \ + __s1 = __s1 + __c0; \ + (s1) = __s1; \ + (m) = - (__c1 + (__s1 < __c0)); \ + } while (0) +#endif + +#if MOD_1_1P_METHOD == 1 +void +mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b) +{ + mp_limb_t bi; + mp_limb_t B1modb, B2modb; + int cnt; + + count_leading_zeros (cnt, b); + + b <<= cnt; + invert_limb (bi, b); + + cps[0] = bi; + cps[1] = cnt; + + B1modb = -b; + if (LIKELY (cnt != 0)) + B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt)); + ASSERT (B1modb <= b); /* NB: not fully reduced mod b */ + cps[2] = B1modb >> cnt; + + /* In the normalized case, this can be simplified to + * + * B2modb = - b * bi; + * ASSERT (B2modb <= b); // NB: equality iff b = B/2 + */ + udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi); + cps[3] = B2modb >> cnt; +} + +mp_limb_t +mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4]) +{ + mp_limb_t rh, rl, bi, ph, pl, r; + mp_limb_t B1modb, B2modb; + mp_size_t i; + int cnt; + mp_limb_t mask; + + ASSERT (n >= 2); /* fix tuneup.c if this is changed */ + + B1modb = bmodb[2]; + B2modb = bmodb[3]; + + rl = ap[n - 1]; + umul_ppmm (ph, pl, rl, B1modb); + add_ssaaaa (rh, rl, ph, pl, CNST_LIMB(0), ap[n - 2]); + + for (i = n - 3; i >= 0; i -= 1) + { + /* rr = ap[i] < B + + LO(rr) * (B mod b) <= (B-1)(b-1) + + HI(rr) * (B^2 mod b) <= (B-1)(b-1) + */ + umul_ppmm (ph, pl, rl, B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i]); + + umul_ppmm (rh, rl, rh, B2modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + } + + cnt = bmodb[1]; + bi = bmodb[0]; + + if (LIKELY (cnt != 0)) + rh = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)); + + mask = -(mp_limb_t) (rh >= b); + rh -= mask & b; + + udiv_rnnd_preinv (r, rh, rl << cnt, b, bi); + + return r >> cnt; +} +#endif /* MOD_1_1P_METHOD == 1 */ + +#if MOD_1_1P_METHOD == 2 +void +mpn_mod_1_1p_cps (mp_limb_t cps[4], mp_limb_t b) +{ + mp_limb_t bi; + mp_limb_t B2modb; + int cnt; + + count_leading_zeros (cnt, b); + + b <<= cnt; + invert_limb (bi, b); + + cps[0] = bi; + cps[1] = cnt; + + 
if (LIKELY (cnt != 0)) + { + mp_limb_t B1modb = -b; + B1modb *= ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt)); + ASSERT (B1modb <= b); /* NB: not fully reduced mod b */ + cps[2] = B1modb >> cnt; + } + B2modb = - b * bi; + ASSERT (B2modb <= b); // NB: equality iff b = B/2 + cps[3] = B2modb; +} + +mp_limb_t +mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t bmodb[4]) +{ + int cnt; + mp_limb_t bi, B1modb; + mp_limb_t r0, r1; + mp_limb_t r; + + ASSERT (n >= 2); /* fix tuneup.c if this is changed */ + + r0 = ap[n-2]; + r1 = ap[n-1]; + + if (n > 2) + { + mp_limb_t B2modb, B2mb; + mp_limb_t p0, p1; + mp_limb_t r2; + mp_size_t j; + + B2modb = bmodb[3]; + B2mb = B2modb - b; + + umul_ppmm (p1, p0, r1, B2modb); + add_mssaaaa (r2, r1, r0, r0, ap[n-3], p1, p0); + + for (j = n-4; j >= 0; j--) + { + mp_limb_t cy; + /* mp_limb_t t = r0 + B2mb; */ + umul_ppmm (p1, p0, r1, B2modb); + + ADDC_LIMB (cy, r0, r0, r2 & B2modb); + /* Alternative, for cmov: if (cy) r0 = t; */ + r0 -= (-cy) & b; + add_mssaaaa (r2, r1, r0, r0, ap[j], p1, p0); + } + + r1 -= (r2 & b); + } + + cnt = bmodb[1]; + + if (LIKELY (cnt != 0)) + { + mp_limb_t t; + mp_limb_t B1modb = bmodb[2]; + + umul_ppmm (r1, t, r1, B1modb); + r0 += t; + r1 += (r0 < t); + + /* Normalize */ + r1 = (r1 << cnt) | (r0 >> (GMP_LIMB_BITS - cnt)); + r0 <<= cnt; + + /* NOTE: Might get r1 == b here, but udiv_rnnd_preinv allows that. */ + } + else + { + mp_limb_t mask = -(mp_limb_t) (r1 >= b); + r1 -= mask & b; + } + + bi = bmodb[0]; + + udiv_rnnd_preinv (r, r1, r0, b, bi); + return r >> cnt; +} +#endif /* MOD_1_1P_METHOD == 2 */ diff --git a/gmp-6.3.0/mpn/generic/mod_1_2.c b/gmp-6.3.0/mpn/generic/mod_1_2.c new file mode 100644 index 0000000..b00d19e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mod_1_2.c @@ -0,0 +1,148 @@ +/* mpn_mod_1s_2p (ap, n, b, cps) + Divide (ap,,n) by b. Return the single-limb remainder. + Requires that b < B / 2. + + Contributed to the GNU project by Torbjorn Granlund. + Based on a suggestion by Peter L. Montgomery. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2008-2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
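Both MOD_1_1P_METHODs rest on the same folding identity: with B1modb = B mod b and B2modb = B^2 mod b precomputed once in cps[], each loop step rewrites the weight-B and weight-B^2 limbs of the running value as small multiples, keeping a two-limb residue with the same value mod b, so only a single udiv_rnnd_preinv is needed at the very end. A small editor's check of the basic identity, assuming 64-bit limbs and b > 0:

#include <assert.h>
#include <stdint.h>

/* Editor's sketch: r1*B + r0 and r1*(B mod b) + r0 have the same residue
   mod b.  Assumes 64-bit limbs, so B = 2^64, and b > 0.  */
static void
check_fold (uint64_t r1, uint64_t r0, uint64_t b)
{
  unsigned __int128 B = (unsigned __int128) 1 << 64;
  uint64_t Bmodb = (uint64_t) (B % b);

  unsigned __int128 x = ((unsigned __int128) r1 << 64) | r0;
  unsigned __int128 y = (unsigned __int128) r1 * Bmodb + r0;

  /* y is at most about 65 bits, so repeating the fold converges quickly.  */
  assert (x % b == y % b);
}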
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +void +mpn_mod_1s_2p_cps (mp_limb_t cps[5], mp_limb_t b) +{ + mp_limb_t bi; + mp_limb_t B1modb, B2modb, B3modb; + int cnt; + + ASSERT (b <= (~(mp_limb_t) 0) / 2); + + count_leading_zeros (cnt, b); + + b <<= cnt; + invert_limb (bi, b); + + cps[0] = bi; + cps[1] = cnt; + + B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt)); + ASSERT (B1modb <= b); /* NB: not fully reduced mod b */ + cps[2] = B1modb >> cnt; + + udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi); + cps[3] = B2modb >> cnt; + + udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi); + cps[4] = B3modb >> cnt; + +#if WANT_ASSERT + { + int i; + b = cps[2]; + for (i = 3; i <= 4; i++) + { + b += cps[i]; + ASSERT (b >= cps[i]); + } + } +#endif +} + +mp_limb_t +mpn_mod_1s_2p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[5]) +{ + mp_limb_t rh, rl, bi, ph, pl, ch, cl, r; + mp_limb_t B1modb, B2modb, B3modb; + mp_size_t i; + int cnt; + + ASSERT (n >= 1); + + B1modb = cps[2]; + B2modb = cps[3]; + B3modb = cps[4]; + + if ((n & 1) != 0) + { + if (n == 1) + { + rl = ap[n - 1]; + bi = cps[0]; + cnt = cps[1]; + udiv_rnnd_preinv (r, rl >> (GMP_LIMB_BITS - cnt), + rl << cnt, b, bi); + return r >> cnt; + } + + umul_ppmm (ph, pl, ap[n - 2], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]); + umul_ppmm (rh, rl, ap[n - 1], B2modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n--; + } + else + { + rh = ap[n - 1]; + rl = ap[n - 2]; + } + + for (i = n - 4; i >= 0; i -= 2) + { + /* rr = ap[i] < B + + ap[i+1] * (B mod b) <= (B-1)(b-1) + + LO(rr) * (B^2 mod b) <= (B-1)(b-1) + + HI(rr) * (B^3 mod b) <= (B-1)(b-1) + */ + umul_ppmm (ph, pl, ap[i + 1], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]); + + umul_ppmm (ch, cl, rl, B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (rh, rl, rh, B3modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + } + + umul_ppmm (rh, cl, rh, B1modb); + add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl); + + cnt = cps[1]; + bi = cps[0]; + + r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)); + udiv_rnnd_preinv (r, r, rl << cnt, b, bi); + + return r >> cnt; +} diff --git a/gmp-6.3.0/mpn/generic/mod_1_3.c b/gmp-6.3.0/mpn/generic/mod_1_3.c new file mode 100644 index 0000000..e4a908d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mod_1_3.c @@ -0,0 +1,155 @@ +/* mpn_mod_1s_3p (ap, n, b, cps) + Divide (ap,,n) by b. Return the single-limb remainder. + Requires that b < B / 3. + + Contributed to the GNU project by Torbjorn Granlund. + Based on a suggestion by Peter L. Montgomery. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2008-2010, 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
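The mpn_mod_1s_*p entry points share the calling convention already visible in mpn_mod_1 above: fill the cps[] table once per divisor, then pass the divisor pre-shifted by cps[1] together with the table. A hedged usage sketch of the 2-limbs-per-iteration variant, using only the interfaces shown in this patch (internal, mutable interfaces, as the file headers warn):

#include "gmp-impl.h"

/* Editor's usage sketch: remainder of {ap,n} mod b with mpn_mod_1s_2p,
   which requires b < B/2.  The cps table can be reused for further
   operands with the same divisor.  */
static mp_limb_t
mod_1s_2p_example (mp_srcptr ap, mp_size_t n, mp_limb_t b)
{
  mp_limb_t cps[5];

  ASSERT (b <= GMP_NUMB_MAX / 2);
  mpn_mod_1s_2p_cps (cps, b);                       /* once per divisor */
  return mpn_mod_1s_2p (ap, n, b << cps[1], cps);   /* b pre-shifted by cnt */
}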
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +void +mpn_mod_1s_3p_cps (mp_limb_t cps[6], mp_limb_t b) +{ + mp_limb_t bi; + mp_limb_t B1modb, B2modb, B3modb, B4modb; + int cnt; + + ASSERT (b <= (~(mp_limb_t) 0) / 3); + + count_leading_zeros (cnt, b); + + b <<= cnt; + invert_limb (bi, b); + + cps[0] = bi; + cps[1] = cnt; + + B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt)); + ASSERT (B1modb <= b); /* NB: not fully reduced mod b */ + cps[2] = B1modb >> cnt; + + udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi); + cps[3] = B2modb >> cnt; + + udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi); + cps[4] = B3modb >> cnt; + + udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi); + cps[5] = B4modb >> cnt; + +#if WANT_ASSERT + { + int i; + b = cps[2]; + for (i = 3; i <= 5; i++) + { + b += cps[i]; + ASSERT (b >= cps[i]); + } + } +#endif +} + +mp_limb_t +mpn_mod_1s_3p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[6]) +{ + mp_limb_t rh, rl, bi, ph, pl, ch, cl, r; + mp_limb_t B1modb, B2modb, B3modb, B4modb; + mp_size_t i; + int cnt; + + ASSERT (n >= 1); + + B1modb = cps[2]; + B2modb = cps[3]; + B3modb = cps[4]; + B4modb = cps[5]; + + /* We compute n mod 3 in a tricky way, which works except for when n is so + close to the maximum size that we don't need to support it. The final + cast to int is a workaround for HP cc. */ + switch ((int) ((mp_limb_t) n * MODLIMB_INVERSE_3 >> (GMP_NUMB_BITS - 2))) + { + case 0: + umul_ppmm (ph, pl, ap[n - 2], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]); + umul_ppmm (rh, rl, ap[n - 1], B2modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n -= 3; + break; + default: /* n mod 3 = 1; (case 2)*/ + rh = 0; + rl = ap[--n]; + break; + case 1: /* n mod 3 = 2 */ + rh = ap[n - 1]; + rl = ap[n - 2]; + n -= 2; + break; + } + + for (i = n - 3; i >= 0; i -= 3) + { + /* rr = ap[i] < B + + ap[i+1] * (B mod b) <= (B-1)(b-1) + + ap[i+2] * (B^2 mod b) <= (B-1)(b-1) + + LO(rr) * (B^3 mod b) <= (B-1)(b-1) + + HI(rr) * (B^4 mod b) <= (B-1)(b-1) + */ + umul_ppmm (ph, pl, ap[i + 1], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]); + + umul_ppmm (ch, cl, ap[i + 2], B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (ch, cl, rl, B3modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (rh, rl, rh, B4modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + } + + umul_ppmm (rh, cl, rh, B1modb); + add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl); + + cnt = cps[1]; + bi = cps[0]; + + r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)); + udiv_rnnd_preinv (r, r, rl << cnt, b, bi); + + return r >> cnt; +} diff --git a/gmp-6.3.0/mpn/generic/mod_1_4.c b/gmp-6.3.0/mpn/generic/mod_1_4.c new file mode 100644 index 0000000..80b42ba --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mod_1_4.c @@ -0,0 +1,170 @@ +/* mpn_mod_1s_4p (ap, n, b, cps) + Divide (ap,,n) by b. Return the single-limb remainder. + Requires that b < B / 4. + + Contributed to the GNU project by Torbjorn Granlund. + Based on a suggestion by Peter L. Montgomery. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. 
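The switch in mpn_mod_1s_3p above classifies n mod 3 without a division: multiplying n by the modular inverse of 3 sends n congruent to 0, 2 and 1 (mod 3) to products whose top two bits are 0, 1, and 2-or-3 respectively, provided n is far from the limb maximum, which is why the case labels appear out of order. Editor's check of the trick, assuming 64-bit limbs where MODLIMB_INVERSE_3 is 0xAAAAAAAAAAAAAAAB:

#include <assert.h>
#include <stdint.h>

/* Editor's sketch: the division-free n mod 3 classification used above,
   assuming 64-bit limbs (MODLIMB_INVERSE_3 == 0xAAAAAAAAAAAAAAAB) and
   n well below 2^63.  */
static void
check_mod3_trick (uint64_t n)
{
  const uint64_t inv3 = 0xAAAAAAAAAAAAAAABull;   /* 1/3 mod 2^64 */
  unsigned top2 = (unsigned) ((n * inv3) >> 62);

  assert (n < ((uint64_t) 1 << 63));
  assert ((n % 3 == 0) == (top2 == 0));   /* the "case 0" label */
  assert ((n % 3 == 2) == (top2 == 1));   /* the "case 1" label */
  assert ((n % 3 == 1) == (top2 >= 2));   /* the "default" label */
}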
IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2008-2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +void +mpn_mod_1s_4p_cps (mp_limb_t cps[7], mp_limb_t b) +{ + mp_limb_t bi; + mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb; + int cnt; + + ASSERT (b <= (~(mp_limb_t) 0) / 4); + + count_leading_zeros (cnt, b); + + b <<= cnt; + invert_limb (bi, b); + + cps[0] = bi; + cps[1] = cnt; + + B1modb = -b * ((bi >> (GMP_LIMB_BITS-cnt)) | (CNST_LIMB(1) << cnt)); + ASSERT (B1modb <= b); /* NB: not fully reduced mod b */ + cps[2] = B1modb >> cnt; + + udiv_rnnd_preinv (B2modb, B1modb, CNST_LIMB(0), b, bi); + cps[3] = B2modb >> cnt; + + udiv_rnnd_preinv (B3modb, B2modb, CNST_LIMB(0), b, bi); + cps[4] = B3modb >> cnt; + + udiv_rnnd_preinv (B4modb, B3modb, CNST_LIMB(0), b, bi); + cps[5] = B4modb >> cnt; + + udiv_rnnd_preinv (B5modb, B4modb, CNST_LIMB(0), b, bi); + cps[6] = B5modb >> cnt; + +#if WANT_ASSERT + { + int i; + b = cps[2]; + for (i = 3; i <= 6; i++) + { + b += cps[i]; + ASSERT (b >= cps[i]); + } + } +#endif +} + +mp_limb_t +mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t b, const mp_limb_t cps[7]) +{ + mp_limb_t rh, rl, bi, ph, pl, ch, cl, r; + mp_limb_t B1modb, B2modb, B3modb, B4modb, B5modb; + mp_size_t i; + int cnt; + + ASSERT (n >= 1); + + B1modb = cps[2]; + B2modb = cps[3]; + B3modb = cps[4]; + B4modb = cps[5]; + B5modb = cps[6]; + + switch (n & 3) + { + case 0: + umul_ppmm (ph, pl, ap[n - 3], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 4]); + umul_ppmm (ch, cl, ap[n - 2], B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + umul_ppmm (rh, rl, ap[n - 1], B3modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n -= 4; + break; + case 1: + rh = 0; + rl = ap[n - 1]; + n -= 1; + break; + case 2: + rh = ap[n - 1]; + rl = ap[n - 2]; + n -= 2; + break; + case 3: + umul_ppmm (ph, pl, ap[n - 2], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[n - 3]); + umul_ppmm (rh, rl, ap[n - 1], B2modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + n -= 3; + break; + } + + for (i = n - 4; i >= 0; i -= 4) + { + /* rr = ap[i] < B + + ap[i+1] * (B mod b) <= (B-1)(b-1) + + ap[i+2] * (B^2 mod b) <= (B-1)(b-1) + + ap[i+3] * (B^3 mod b) <= (B-1)(b-1) + + LO(rr) * (B^4 mod b) <= (B-1)(b-1) + + HI(rr) * (B^5 mod b) <= (B-1)(b-1) + */ + umul_ppmm (ph, pl, ap[i + 1], B1modb); + add_ssaaaa (ph, pl, ph, pl, CNST_LIMB(0), ap[i + 0]); + + umul_ppmm (ch, cl, ap[i + 2], B2modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + 
+ umul_ppmm (ch, cl, ap[i + 3], B3modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (ch, cl, rl, B4modb); + add_ssaaaa (ph, pl, ph, pl, ch, cl); + + umul_ppmm (rh, rl, rh, B5modb); + add_ssaaaa (rh, rl, rh, rl, ph, pl); + } + + umul_ppmm (rh, cl, rh, B1modb); + add_ssaaaa (rh, rl, rh, rl, CNST_LIMB(0), cl); + + cnt = cps[1]; + bi = cps[0]; + + r = (rh << cnt) | (rl >> (GMP_LIMB_BITS - cnt)); + udiv_rnnd_preinv (r, r, rl << cnt, b, bi); + + return r >> cnt; +} diff --git a/gmp-6.3.0/mpn/generic/mod_34lsub1.c b/gmp-6.3.0/mpn/generic/mod_34lsub1.c new file mode 100644 index 0000000..af9c6c6 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mod_34lsub1.c @@ -0,0 +1,128 @@ +/* mpn_mod_34lsub1 -- remainder modulo 2^(GMP_NUMB_BITS*3/4)-1. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + +/* Calculate a remainder from {p,n} divided by 2^(GMP_NUMB_BITS*3/4)-1. + The remainder is not fully reduced, it's any limb value congruent to + {p,n} modulo that divisor. + + This implementation is only correct when GMP_NUMB_BITS is a multiple of + 4. + + FIXME: If GMP_NAIL_BITS is some silly big value during development then + it's possible the carry accumulators c0,c1,c2 could overflow. + + General notes: + + The basic idea is to use a set of N accumulators (N=3 in this case) to + effectively get a remainder mod 2^(GMP_NUMB_BITS*N)-1 followed at the end + by a reduction to GMP_NUMB_BITS*N/M bits (M=4 in this case) for a + remainder mod 2^(GMP_NUMB_BITS*N/M)-1. N and M are chosen to give a good + set of small prime factors in 2^(GMP_NUMB_BITS*N/M)-1. + + N=3 M=4 suits GMP_NUMB_BITS==32 and GMP_NUMB_BITS==64 quite well, giving + a few more primes than a single accumulator N=1 does, and for no extra + cost (assuming the processor has a decent number of registers). + + For strange nailified values of GMP_NUMB_BITS the idea would be to look + for what N and M give good primes. With GMP_NUMB_BITS not a power of 2 + the choices for M may be opened up a bit. But such things are probably + best done in separate code, not grafted on here. 
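For 64-bit limbs the mod_34lsub1 divisor is m = 2^48 - 1, and the macros defined below rely on 2^48 == 1 (mod m): a limb h*2^48 + l is congruent to l + h, and the B and B^2 weights of the three accumulators reduce to shifts by 16 and 32 bits. Editor's check of the single-limb fold, assuming 64-bit limbs:

#include <assert.h>
#include <stdint.h>

/* Editor's sketch: folding one 64-bit value modulo 2^48 - 1, the same step
   the LOW0/HIGH0 macros below perform on the accumulators.  */
static void
check_fold_48 (uint64_t x)
{
  const uint64_t m = ((uint64_t) 1 << 48) - 1;
  uint64_t folded = (x & m) + (x >> 48);   /* fits easily in 64 bits */

  assert (folded % m == x % m);
}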
*/ + +#if GMP_NUMB_BITS % 4 == 0 + +#define B1 (GMP_NUMB_BITS / 4) +#define B2 (B1 * 2) +#define B3 (B1 * 3) + +#define M1 ((CNST_LIMB(1) << B1) - 1) +#define M2 ((CNST_LIMB(1) << B2) - 1) +#define M3 ((CNST_LIMB(1) << B3) - 1) + +#define LOW0(n) ((n) & M3) +#define HIGH0(n) ((n) >> B3) + +#define LOW1(n) (((n) & M2) << B1) +#define HIGH1(n) ((n) >> B2) + +#define LOW2(n) (((n) & M1) << B2) +#define HIGH2(n) ((n) >> B1) + +#define PARTS0(n) (LOW0(n) + HIGH0(n)) +#define PARTS1(n) (LOW1(n) + HIGH1(n)) +#define PARTS2(n) (LOW2(n) + HIGH2(n)) + +#define ADD(c,a,val) \ + do { \ + mp_limb_t new_c; \ + ADDC_LIMB (new_c, a, a, val); \ + (c) += new_c; \ + } while (0) + +mp_limb_t +mpn_mod_34lsub1 (mp_srcptr p, mp_size_t n) +{ + mp_limb_t c0, c1, c2; + mp_limb_t a0, a1, a2; + + ASSERT (n >= 1); + ASSERT (n/3 < GMP_NUMB_MAX); + + a0 = a1 = a2 = 0; + c0 = c1 = c2 = 0; + + while ((n -= 3) >= 0) + { + ADD (c0, a0, p[0]); + ADD (c1, a1, p[1]); + ADD (c2, a2, p[2]); + p += 3; + } + + if (n != -3) + { + ADD (c0, a0, p[0]); + if (n != -2) + ADD (c1, a1, p[1]); + } + + return + PARTS0 (a0) + PARTS1 (a1) + PARTS2 (a2) + + PARTS1 (c0) + PARTS2 (c1) + PARTS0 (c2); +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/mode1o.c b/gmp-6.3.0/mpn/generic/mode1o.c new file mode 100644 index 0000000..9ba0ae1 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mode1o.c @@ -0,0 +1,235 @@ +/* mpn_modexact_1c_odd -- mpn by limb exact division style remainder. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2000-2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Calculate an r satisfying + + r*B^k + a - c == q*d + + where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1 + (the caller won't know which), and q is the quotient (discarded). d must + be odd, c can be any limb value. + + If c=d then 0<=r<=d. + + This slightly strange function suits the initial Nx1 reduction for GCDs + or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving + -r == a mod d (by passing c=0). For a GCD the factor of -1 on r can be + ignored, or for the Jacobi symbol it can be accounted for. The function + also suits divisibility and congruence testing since if r=0 (or r=d) is + obtained then a==c mod d. + + + r is a bit like the remainder returned by mpn_divexact_by3c, and is the + sort of remainder mpn_divexact_1 might return. 
Like mpn_divexact_by3c, r + represents a borrow, since effectively quotient limbs are chosen so that + subtracting that multiple of d from src at each step will produce a zero + limb. + + A long calculation can be done piece by piece from low to high by passing + the return value from one part as the carry parameter to the next part. + The effective final k becomes anything between size and size-n, if n + pieces are used. + + + A similar sort of routine could be constructed based on adding multiples + of d at each limb, much like redc in mpz_powm does. Subtracting however + has a small advantage that when subtracting to cancel out l there's never + a borrow into h, whereas using an addition would put a carry into h + depending whether l==0 or l!=0. + + + In terms of efficiency, this function is similar to a mul-by-inverse + mpn_mod_1. Both are essentially two multiplies and are best suited to + CPUs with low latency multipliers (in comparison to a divide instruction + at least.) But modexact has a few less supplementary operations, only + needs low part and high part multiplies, and has fewer working quantities + (helping CPUs with few registers). + + + In the main loop it will be noted that the new carry (call it r) is the + sum of the high product h and any borrow from l=s-c. If c<d then a borrow + means s<c<d, so l wraps to l>=B-d+1 and hence will + never have h=d-1 and so r=h+borrow <= d-1. + + When c>=d, on the other hand, h=d-1 can certainly occur together with a + borrow, thereby giving only r<=d, as per the function definition above. + + As a design decision it's left to the caller to check for r=d if it might + be passing c>=d. Several applications have c<d initially, so the extra + test is unnecessary for them. */ + +mp_limb_t +mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, + mp_limb_t orig_c) +{ + mp_limb_t s, h, l, inverse, dummy, dmul, ret; + mp_limb_t c = orig_c; + mp_size_t i; + + ASSERT (size >= 1); + ASSERT (d & 1); + ASSERT_MPN (src, size); + ASSERT_LIMB (d); + ASSERT_LIMB (c); + + if (size == 1) + { + s = src[0]; + if (s > c) + { + l = s-c; + h = l % d; + if (h != 0) + h = d - h; + } + else + { + l = c-s; + h = l % d; + } + return h; + } + + + binvert_limb (inverse, d); + dmul = d << GMP_NAIL_BITS; + + i = 0; + do + { + s = src[i]; + SUBC_LIMB (c, l, s, c); + l = (l * inverse) & GMP_NUMB_MASK; + umul_ppmm (h, dummy, l, dmul); + c += h; + } + while (++i < size-1); + + + s = src[i]; + if (s <= d) + { + /* With high<=d the final step can be a subtract and addback. If c==0 + then the addback will restore to l>=0. If c==d then will get l==d + if s==0, but that's ok per the function definition. */ + + l = c - s; + if (c < s) + l += d; + + ret = l; + } + else + { + /* Can't skip a divide, just do the loop code once more. */ + + SUBC_LIMB (c, l, s, c); + l = (l * inverse) & GMP_NUMB_MASK; + umul_ppmm (h, dummy, l, dmul); + c += h; + ret = c; + } + + ASSERT (orig_c < d ? ret < d : ret <= d); + return ret; +} + + + +#if 0 + +/* The following is an alternate form that might shave one cycle on a + superscalar processor since it takes c+=h off the dependent chain, + leaving just a low product, high product, and a subtract. + + This is for CPU specific implementations to consider. A special case for the borrow out of x=s-c (c=(x>s), say) could become + c=(x==0xFF..FF) too, if that helped.
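The heart of mpn_modexact_1c_odd is the choice of quotient limb q = (s - c) * d^-1 mod B, which makes the low limb of q*d cancel exactly; the next carry is then just the high limb of that product plus any borrow from s - c. An editor's sketch of one step, assuming 64-bit limbs without nails and with a Newton-iteration inverse standing in for binvert_limb:

#include <assert.h>
#include <stdint.h>

/* Editor's sketch, not GMP code: inverse of an odd d mod 2^64 by Newton
   iteration (binvert_limb uses a table plus the same iteration).  */
static uint64_t
binvert64 (uint64_t d)
{
  uint64_t v = d;          /* d is its own inverse mod 8: 3 good bits */
  v *= 2 - d * v;          /* 6 bits */
  v *= 2 - d * v;          /* 12 bits */
  v *= 2 - d * v;          /* 24 bits */
  v *= 2 - d * v;          /* 48 bits */
  v *= 2 - d * v;          /* 96 >= 64 bits */
  return v;
}

/* One loop step: returns the next carry given limb s, carry c, odd d,
   and inv = binvert64 (d).  */
static uint64_t
modexact_step (uint64_t s, uint64_t c, uint64_t d, uint64_t inv)
{
  uint64_t borrow = s < c;
  uint64_t q = (s - c) * inv;                    /* q*d == s-c (mod 2^64) */
  unsigned __int128 p = (unsigned __int128) q * d;

  assert ((uint64_t) p == s - c);                /* low limb cancels */
  return (uint64_t) (p >> 64) + borrow;          /* h + borrow */
}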
*/ + +mp_limb_t +mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h) +{ + mp_limb_t s, x, y, inverse, dummy, dmul, c1, c2; + mp_limb_t c = 0; + mp_size_t i; + + ASSERT (size >= 1); + ASSERT (d & 1); + + binvert_limb (inverse, d); + dmul = d << GMP_NAIL_BITS; + + for (i = 0; i < size; i++) + { + ASSERT (c==0 || c==1); + + s = src[i]; + SUBC_LIMB (c1, x, s, c); + + SUBC_LIMB (c2, y, x, h); + c = c1 + c2; + + y = (y * inverse) & GMP_NUMB_MASK; + umul_ppmm (h, dummy, y, dmul); + } + + h += c; + return h; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/mu_bdiv_q.c b/gmp-6.3.0/mpn/generic/mu_bdiv_q.c new file mode 100644 index 0000000..0ef3bd8 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mu_bdiv_q.c @@ -0,0 +1,281 @@ +/* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn. + storing the result in {qp,nn}. Overlap allowed between Q and N; all other + overlap disallowed. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2005-2007, 2009, 2010, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* + The idea of the algorithm used herein is to compute a smaller inverted value + than used in the standard Barrett algorithm, and thus save time in the + Newton iterations, and pay just a small price when using the inverted value + for developing quotient bits. This algorithm was presented at ICMS 2006. +*/ + +#include "gmp-impl.h" + + +/* N = {np,nn} + D = {dp,dn} + + Requirements: N >= D + D >= 1 + D odd + dn >= 2 + nn >= 2 + scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn). + + Write quotient to Q = {qp,nn}. + + FIXME: When iterating, perhaps do the small step before loop, not after. + FIXME: Try to avoid the scalar divisions when computing inverse size. + FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible. In + particular, when dn==in, tp and rp could use the same space. + FIXME: Trim final quotient calculation to qn limbs of precision. 
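Like the other mu_* routines in this series, mpn_mu_bdiv_q expects the caller to supply its workspace: ask mpn_mu_bdiv_q_itch for the limb count the chosen sizes need and pass a block of that size. A hedged usage sketch built only from the interfaces named in this file (internal interfaces; dn >= 2, nn >= 2, D odd, per the requirements above):

#include "gmp-impl.h"

/* Editor's usage sketch: {qp,nn} <- {np,nn} / {dp,dn} mod B^nn with the
   caller-allocated scratch convention used throughout these files.  */
static void
bdiv_q_example (mp_ptr qp, mp_srcptr np, mp_size_t nn,
                mp_srcptr dp, mp_size_t dn)
{
  mp_ptr scratch;
  TMP_DECL;

  TMP_MARK;
  scratch = TMP_ALLOC_LIMBS (mpn_mu_bdiv_q_itch (nn, dn));
  mpn_mu_bdiv_q (qp, np, nn, dp, dn, scratch);
  TMP_FREE;
}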
+*/ +static void +mpn_mu_bdiv_q_old (mp_ptr qp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr scratch) +{ + mp_size_t qn; + mp_size_t in; + int cy, c0; + mp_size_t tn, wn; + + qn = nn; + + ASSERT (dn >= 2); + ASSERT (qn >= 2); + + if (qn > dn) + { + mp_size_t b; + + /* |_______________________| dividend + |________| divisor */ + +#define ip scratch /* in */ +#define rp (scratch + in) /* dn or rest >= binvert_itch(in) */ +#define tp (scratch + in + dn) /* dn+in or next_size(dn) */ +#define scratch_out (scratch + in + dn + tn) /* mulmod_bnm1_itch(next_size(dn)) */ + + /* Compute an inverse size that is a nice partition of the quotient. */ + b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ + in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ + + /* Some notes on allocation: + + When in = dn, R dies when mpn_mullo returns, if in < dn the low in + limbs of R dies at that point. We could save memory by letting T live + just under R, and let the upper part of T expand into R. These changes + should reduce itch to perhaps 3dn. + */ + + mpn_binvert (ip, dp, in, rp); + + cy = 0; + + MPN_COPY (rp, np, dn); + np += dn; + mpn_mullo_n (qp, rp, ip, in); + qn -= in; + + while (qn > in) + { + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[dn+in-1...in] */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); + wn = dn + in - tn; /* number of wrapped limbs */ + if (wn > 0) + { + c0 = mpn_sub_n (tp + tn, tp, rp, wn); + mpn_decr_u (tp + wn, c0); + } + } + + qp += in; + if (dn != in) + { + /* Subtract tp[dn-1...in] from partial remainder. */ + cy += mpn_sub_n (rp, rp + in, tp + in, dn - in); + if (cy == 2) + { + mpn_incr_u (tp + dn, 1); + cy = 1; + } + } + /* Subtract tp[dn+in-1...dn] from dividend. */ + cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy); + np += in; + mpn_mullo_n (qp, rp, ip, in); + qn -= in; + } + + /* Generate last qn limbs. + FIXME: It should be possible to limit precision here, since qn is + typically somewhat smaller than dn. No big gains expected. */ + + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[qn+in-1...in] */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); + wn = dn + in - tn; /* number of wrapped limbs */ + if (wn > 0) + { + c0 = mpn_sub_n (tp + tn, tp, rp, wn); + mpn_decr_u (tp + wn, c0); + } + } + + qp += in; + if (dn != in) + { + cy += mpn_sub_n (rp, rp + in, tp + in, dn - in); + if (cy == 2) + { + mpn_incr_u (tp + dn, 1); + cy = 1; + } + } + + mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy); + mpn_mullo_n (qp, rp, ip, qn); + +#undef ip +#undef rp +#undef tp +#undef scratch_out + } + else + { + /* |_______________________| dividend + |________________| divisor */ + +#define ip scratch /* in */ +#define tp (scratch + in) /* qn+in or next_size(qn) or rest >= binvert_itch(in) */ +#define scratch_out (scratch + in + tn)/* mulmod_bnm1_itch(next_size(qn)) */ + + /* Compute half-sized inverse. 
*/ + in = qn - (qn >> 1); + + mpn_binvert (ip, dp, in, tp); + + mpn_mullo_n (qp, np, ip, in); /* low `in' quotient limbs */ + + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, qn, qp, in); /* mulhigh */ + else + { + tn = mpn_mulmod_bnm1_next_size (qn); + mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out); + wn = qn + in - tn; /* number of wrapped limbs */ + if (wn > 0) + { + c0 = mpn_cmp (tp, np, wn) < 0; + mpn_decr_u (tp + wn, c0); + } + } + + mpn_sub_n (tp, np + in, tp + in, qn - in); + mpn_mullo_n (qp + in, tp, ip, qn - in); /* high qn-in quotient limbs */ + +#undef ip +#undef tp +#undef scratch_out + } +} + +void +mpn_mu_bdiv_q (mp_ptr qp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr scratch) +{ + mpn_mu_bdiv_q_old (qp, np, nn, dp, dn, scratch); + mpn_neg (qp, qp, nn); +} + +mp_size_t +mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn) +{ + mp_size_t qn, in, tn, itch_binvert, itch_out, itches; + mp_size_t b; + + ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD); + + qn = nn; + + if (qn > dn) + { + b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ + in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + { + tn = dn + in; + itch_out = 0; + } + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + itch_out = mpn_mulmod_bnm1_itch (tn, dn, in); + } + itches = dn + tn + itch_out; + } + else + { + in = qn - (qn >> 1); + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + { + tn = qn + in; + itch_out = 0; + } + else + { + tn = mpn_mulmod_bnm1_next_size (qn); + itch_out = mpn_mulmod_bnm1_itch (tn, qn, in); + } + itches = tn + itch_out; + } + + itch_binvert = mpn_binvert_itch (in); + return in + MAX (itches, itch_binvert); +} diff --git a/gmp-6.3.0/mpn/generic/mu_bdiv_qr.c b/gmp-6.3.0/mpn/generic/mu_bdiv_qr.c new file mode 100644 index 0000000..540ad73 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mu_bdiv_qr.c @@ -0,0 +1,312 @@ +/* mpn_mu_bdiv_qr(qp,rp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^qn, + where qn = nn-dn, storing the result in {qp,qn}. Overlap allowed between Q + and N; all other overlap disallowed. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2005-2007, 2009, 2010, 2012, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
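Both the setup code and the itch functions derive the inverse size from the same two ceiling divisions: split the qn quotient limbs into b = ceil(qn/dn) blocks and take in = ceil(qn/b), which guarantees in <= dn while the b blocks of in limbs still cover the whole quotient. An editor's restatement of that arithmetic:

#include <assert.h>
#include <stddef.h>

/* Editor's sketch: the inverse-size partition used by the mu_bdiv_* setup
   and itch code, for the qn > dn case.  */
static size_t
choose_in (size_t qn, size_t dn)
{
  size_t b, in;

  assert (qn > dn && dn >= 1);
  b  = (qn - 1) / dn + 1;        /* ceil(qn/dn), number of blocks */
  in = (qn - 1) / b + 1;         /* ceil(qn/b) */

  assert (in <= dn);             /* the inverse never exceeds the divisor */
  assert (b * in >= qn);         /* the blocks cover the quotient */
  return in;
}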
*/ + + +/* + The idea of the algorithm used herein is to compute a smaller inverted value + than used in the standard Barrett algorithm, and thus save time in the + Newton iterations, and pay just a small price when using the inverted value + for developing quotient bits. This algorithm was presented at ICMS 2006. +*/ + +#include "gmp-impl.h" + + +/* N = {np,nn} + D = {dp,dn} + + Requirements: N >= D + D >= 1 + D odd + dn >= 2 + nn >= 2 + scratch space as determined by mpn_mu_bdiv_qr_itch(nn,dn). + + Write quotient to Q = {qp,nn-dn}. + + FIXME: When iterating, perhaps do the small step before loop, not after. + FIXME: Try to avoid the scalar divisions when computing inverse size. + FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible. In + particular, when dn==in, tp and rp could use the same space. +*/ +static mp_limb_t +mpn_mu_bdiv_qr_old (mp_ptr qp, + mp_ptr rp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr scratch) +{ + mp_size_t qn; + mp_size_t in; + mp_limb_t cy, c0; + mp_size_t tn, wn; + + qn = nn - dn; + + ASSERT (dn >= 2); + ASSERT (qn >= 2); + + if (qn > dn) + { + mp_size_t b; + + /* |_______________________| dividend + |________| divisor */ + +#define ip scratch /* in */ +#define tp (scratch + in) /* dn+in or next_size(dn) or rest >= binvert_itch(in) */ +#define scratch_out (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */ + + /* Compute an inverse size that is a nice partition of the quotient. */ + b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ + in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ + + /* Some notes on allocation: + + When in = dn, R dies when mpn_mullo returns, if in < dn the low in + limbs of R dies at that point. We could save memory by letting T live + just under R, and let the upper part of T expand into R. These changes + should reduce itch to perhaps 3dn. + */ + + mpn_binvert (ip, dp, in, tp); + + MPN_COPY (rp, np, dn); + np += dn; + cy = 0; + + while (qn > in) + { + mpn_mullo_n (qp, rp, ip, in); + + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, in); /* mulhi, need tp[dn+in-1...in] */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); + wn = dn + in - tn; /* number of wrapped limbs */ + if (wn > 0) + { + c0 = mpn_sub_n (tp + tn, tp, rp, wn); + mpn_decr_u (tp + wn, c0); + } + } + + qp += in; + qn -= in; + + if (dn != in) + { + /* Subtract tp[dn-1...in] from partial remainder. */ + cy += mpn_sub_n (rp, rp + in, tp + in, dn - in); + if (cy == 2) + { + mpn_incr_u (tp + dn, 1); + cy = 1; + } + } + /* Subtract tp[dn+in-1...dn] from dividend. */ + cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy); + np += in; + } + + /* Generate last qn limbs. 
*/ + mpn_mullo_n (qp, rp, ip, qn); + + if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, qn); /* mulhi, need tp[qn+in-1...in] */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out); + wn = dn + qn - tn; /* number of wrapped limbs */ + if (wn > 0) + { + c0 = mpn_sub_n (tp + tn, tp, rp, wn); + mpn_decr_u (tp + wn, c0); + } + } + + if (dn != qn) + { + cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn); + if (cy == 2) + { + mpn_incr_u (tp + dn, 1); + cy = 1; + } + } + return mpn_sub_nc (rp + dn - qn, np, tp + dn, qn, cy); + +#undef ip +#undef tp +#undef scratch_out + } + else + { + /* |_______________________| dividend + |________________| divisor */ + +#define ip scratch /* in */ +#define tp (scratch + in) /* dn+in or next_size(dn) or rest >= binvert_itch(in) */ +#define scratch_out (scratch + in + tn)/* mulmod_bnm1_itch(next_size(dn)) */ + + /* Compute half-sized inverse. */ + in = qn - (qn >> 1); + + mpn_binvert (ip, dp, in, tp); + + mpn_mullo_n (qp, np, ip, in); /* low `in' quotient limbs */ + + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, in); /* mulhigh */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); + wn = dn + in - tn; /* number of wrapped limbs */ + if (wn > 0) + { + c0 = mpn_sub_n (tp + tn, tp, np, wn); + mpn_decr_u (tp + wn, c0); + } + } + + qp += in; + qn -= in; + + cy = mpn_sub_n (rp, np + in, tp + in, dn); + mpn_mullo_n (qp, rp, ip, qn); /* high qn quotient limbs */ + + if (BELOW_THRESHOLD (qn, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, qn); /* mulhigh */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, qn, scratch_out); + wn = dn + qn - tn; /* number of wrapped limbs */ + if (wn > 0) + { + c0 = mpn_sub_n (tp + tn, tp, rp, wn); + mpn_decr_u (tp + wn, c0); + } + } + + cy += mpn_sub_n (rp, rp + qn, tp + qn, dn - qn); + if (cy == 2) + { + mpn_incr_u (tp + dn, 1); + cy = 1; + } + return mpn_sub_nc (rp + dn - qn, np + dn + in, tp + dn, qn, cy); + +#undef ip +#undef tp +#undef scratch_out + } +} + +mp_limb_t +mpn_mu_bdiv_qr (mp_ptr qp, + mp_ptr rp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr scratch) +{ + mp_limb_t cy = mpn_mu_bdiv_qr_old (qp, rp, np, nn, dp, dn, scratch); + + /* R' B^{qn} = U - Q' D + * + * Q = B^{qn} - Q' (assuming Q' != 0) + * + * R B^{qn} = U + Q D = U + B^{qn} D - Q' D + * = B^{qn} D + R' + */ + + if (UNLIKELY (!mpn_neg (qp, qp, nn - dn))) + { + /* Zero quotient. 
*/ + ASSERT (cy == 0); + return 0; + } + else + { + mp_limb_t cy2 = mpn_add_n (rp, rp, dp, dn); + ASSERT (cy2 >= cy); + + return cy2 - cy; + } +} + + +mp_size_t +mpn_mu_bdiv_qr_itch (mp_size_t nn, mp_size_t dn) +{ + mp_size_t qn, in, tn, itch_binvert, itch_out, itches; + mp_size_t b; + + ASSERT_ALWAYS (DC_BDIV_Q_THRESHOLD < MU_BDIV_Q_THRESHOLD); + + qn = nn - dn; + + if (qn > dn) + { + b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ + in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ + } + else + { + in = qn - (qn >> 1); + } + + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + { + tn = dn + in; + itch_out = 0; + } + else + { + tn = mpn_mulmod_bnm1_next_size (dn); + itch_out = mpn_mulmod_bnm1_itch (tn, dn, in); + } + + itch_binvert = mpn_binvert_itch (in); + itches = tn + itch_out; + return in + MAX (itches, itch_binvert); +} diff --git a/gmp-6.3.0/mpn/generic/mu_div_q.c b/gmp-6.3.0/mpn/generic/mu_div_q.c new file mode 100644 index 0000000..44cfb40 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mu_div_q.c @@ -0,0 +1,184 @@ +/* mpn_mu_div_q. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2005-2007, 2009, 2010, 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* + The idea of the algorithm used herein is to compute a smaller inverted value + than used in the standard Barrett algorithm, and thus save time in the + Newton iterations, and pay just a small price when using the inverted value + for developing quotient bits. This algorithm was presented at ICMS 2006. +*/ + +/* + Things to work on: + + 1. This is a rudimentary implementation of mpn_mu_div_q. The algorithm is + probably close to optimal, except when mpn_mu_divappr_q fails. + + 2. We used to fall back to mpn_mu_div_qr when we detect a possible + mpn_mu_divappr_q rounding problem, now we multiply and compare. + Unfortunately, since mpn_mu_divappr_q does not return the partial + remainder, this also doesn't become optimal. A mpn_mu_divappr_qr could + solve that. + + 3. The allocations done here should be made from the scratch area, which + then would need to be amended. 
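The mpn_mu_bdiv_qr wrapper above converts the old bdiv convention into the new one exactly as its comment states: with U - Q'*D == R'*B^qn, taking Q = B^qn - Q' and R = D + R' gives U + Q*D == R*B^qn, which is why the quotient is negated and, whenever it is nonzero, D is added back into the remainder. An editor's check of that identity, scaled down to a toy base B = 2^16 so the arithmetic stays exact in 64 bits:

#include <assert.h>
#include <stdint.h>

/* Editor's sketch: the convention change done by mpn_mu_bdiv_qr, with a
   toy base B = 2^16 in place of a full limb.  Old: U - Q'*D == R'*B.
   New: Q = B - Q', R = D + R', and then U + Q*D == R*B.  */
static void
check_bdiv_adjust (uint64_t qprime, uint64_t d, uint64_t rprime)
{
  const uint64_t B = (uint64_t) 1 << 16;
  uint64_t U, q, r;

  assert (qprime > 0 && qprime < B && d < B && rprime < B);

  U = qprime * d + rprime * B;    /* any U of the old form */
  q = B - qprime;
  r = d + rprime;

  assert (U + q * d == r * B);
}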
+*/ + +#include /* for NULL */ +#include "gmp-impl.h" + + +mp_limb_t +mpn_mu_div_q (mp_ptr qp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr scratch) +{ + mp_ptr tp, rp; + mp_size_t qn; + mp_limb_t cy, qh; + TMP_DECL; + + TMP_MARK; + + qn = nn - dn; + + tp = TMP_BALLOC_LIMBS (qn + 1); + + if (qn >= dn) /* nn >= 2*dn + 1 */ + { + /* |_______________________| dividend + |________| divisor */ + + rp = TMP_BALLOC_LIMBS (nn + 1); + MPN_COPY (rp + 1, np, nn); + rp[0] = 0; + + qh = mpn_cmp (rp + 1 + nn - dn, dp, dn) >= 0; + if (qh != 0) + mpn_sub_n (rp + 1 + nn - dn, rp + 1 + nn - dn, dp, dn); + + cy = mpn_mu_divappr_q (tp, rp, nn + 1, dp, dn, scratch); + + if (UNLIKELY (cy != 0)) + { + /* Since the partial remainder fed to mpn_preinv_mu_divappr_q was + canonically reduced, replace the returned value of B^(qn-dn)+eps + by the largest possible value. */ + mp_size_t i; + for (i = 0; i < qn + 1; i++) + tp[i] = GMP_NUMB_MAX; + } + + /* The max error of mpn_mu_divappr_q is +4. If the low quotient limb is + smaller than the max error, we cannot trust the quotient. */ + if (tp[0] > 4) + { + MPN_COPY (qp, tp + 1, qn); + } + else + { + mp_limb_t cy; + mp_ptr pp; + + pp = rp; + mpn_mul (pp, tp + 1, qn, dp, dn); + + cy = (qh != 0) ? mpn_add_n (pp + qn, pp + qn, dp, dn) : 0; + + if (cy || mpn_cmp (pp, np, nn) > 0) /* At most is wrong by one, no cycle. */ + qh -= mpn_sub_1 (qp, tp + 1, qn, 1); + else /* Same as above */ + MPN_COPY (qp, tp + 1, qn); + } + } + else + { + /* |_______________________| dividend + |________________| divisor */ + + /* FIXME: When nn = 2dn-1, qn becomes dn-1, and the numerator size passed + here becomes 2dn, i.e., more than nn. This shouldn't hurt, since only + the most significant dn-1 limbs will actually be read, but it is not + pretty. */ + + qh = mpn_mu_divappr_q (tp, np + nn - (2 * qn + 2), 2 * qn + 2, + dp + dn - (qn + 1), qn + 1, scratch); + + /* The max error of mpn_mu_divappr_q is +4, but we get an additional + error from the divisor truncation. */ + if (tp[0] > 6) + { + MPN_COPY (qp, tp + 1, qn); + } + else + { + mp_limb_t cy; + + /* FIXME: a shorter product should be enough; we may use already + allocated space... */ + rp = TMP_BALLOC_LIMBS (nn); + mpn_mul (rp, dp, dn, tp + 1, qn); + + cy = (qh != 0) ? mpn_add_n (rp + qn, rp + qn, dp, dn) : 0; + + if (cy || mpn_cmp (rp, np, nn) > 0) /* At most is wrong by one, no cycle. */ + qh -= mpn_sub_1 (qp, tp + 1, qn, 1); + else /* Same as above */ + MPN_COPY (qp, tp + 1, qn); + } + } + + TMP_FREE; + return qh; +} + +mp_size_t +mpn_mu_div_q_itch (mp_size_t nn, mp_size_t dn, int mua_k) +{ + mp_size_t qn; + + qn = nn - dn; + if (qn >= dn) + { + return mpn_mu_divappr_q_itch (nn + 1, dn, mua_k); + } + else + { + return mpn_mu_divappr_q_itch (2 * qn + 2, qn + 1, mua_k); + } +} diff --git a/gmp-6.3.0/mpn/generic/mu_div_qr.c b/gmp-6.3.0/mpn/generic/mu_div_qr.c new file mode 100644 index 0000000..8b9c702 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mu_div_qr.c @@ -0,0 +1,417 @@ +/* mpn_mu_div_qr, mpn_preinv_mu_div_qr. + + Compute Q = floor(N / D) and R = N-QD. N is nn limbs and D is dn limbs and + must be normalized, and Q must be nn-dn limbs. The requirement that Q is + nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us to + let N be unmodified during the operation. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. 
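When the low limb of the approximate quotient falls inside mpn_mu_divappr_q's error margin, mpn_mu_div_q above multiplies the candidate back and compares against N, subtracting one when the product is too large; as the code comments note, the candidate is at most one too big at that point. A single-limb editor's analogue of that fix-up:

#include <assert.h>
#include <stdint.h>

/* Editor's sketch: multiply-back correction of a quotient candidate that
   is known to be either floor(n/d) or floor(n/d) + 1; d > 0 assumed.  */
static uint64_t
fixup_quotient (uint64_t q_candidate, uint64_t d, uint64_t n)
{
  unsigned __int128 p = (unsigned __int128) q_candidate * d;

  if (p > n)              /* one too large: adjust down */
    q_candidate -= 1;

  assert (q_candidate == n / d);
  return q_candidate;
}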
IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* + The idea of the algorithm used herein is to compute a smaller inverted value + than used in the standard Barrett algorithm, and thus save time in the + Newton iterations, and pay just a small price when using the inverted value + for developing quotient bits. This algorithm was presented at ICMS 2006. +*/ + +/* CAUTION: This code and the code in mu_divappr_q.c should be edited in sync. + + Things to work on: + + * This isn't optimal when the quotient isn't needed, as it might take a lot + of space. The computation is always needed, though, so there is no time to + save with special code. + + * The itch/scratch scheme isn't perhaps such a good idea as it once seemed, + demonstrated by the fact that the mpn_invertappr function's scratch needs + mean that we need to keep a large allocation long after it is needed. + Things are worse as mpn_mul_fft does not accept any scratch parameter, + which means we'll have a large memory hole while in mpn_mul_fft. In + general, a peak scratch need in the beginning of a function isn't + well-handled by the itch/scratch scheme. +*/ + +#ifdef STAT +#undef STAT +#define STAT(x) x +#else +#define STAT(x) +#endif + +#include /* for NULL */ +#include "gmp-impl.h" + + +/* FIXME: The MU_DIV_QR_SKEW_THRESHOLD was not analysed properly. It gives a + speedup according to old measurements, but does the decision mechanism + really make sense? It seem like the quotient between dn and qn might be + what we really should be checking. */ +#ifndef MU_DIV_QR_SKEW_THRESHOLD +#define MU_DIV_QR_SKEW_THRESHOLD 100 +#endif + +#ifdef CHECK /* FIXME: Enable in minithres */ +#undef MU_DIV_QR_SKEW_THRESHOLD +#define MU_DIV_QR_SKEW_THRESHOLD 1 +#endif + + +static mp_limb_t mpn_mu_div_qr2 (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr); +static mp_size_t mpn_mu_div_qr_choose_in (mp_size_t, mp_size_t, int); + + +mp_limb_t +mpn_mu_div_qr (mp_ptr qp, + mp_ptr rp, + mp_srcptr np, + mp_size_t nn, + mp_srcptr dp, + mp_size_t dn, + mp_ptr scratch) +{ + mp_size_t qn; + mp_limb_t cy, qh; + + qn = nn - dn; + if (qn + MU_DIV_QR_SKEW_THRESHOLD < dn) + { + /* |______________|_ign_first__| dividend nn + |_______|_ign_first__| divisor dn + + |______| quotient (prel) qn + + |___________________| quotient * ignored-divisor-part dn-1 + */ + + /* Compute a preliminary quotient and a partial remainder by dividing the + most significant limbs of each operand. 
*/ + qh = mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1), + np + nn - (2 * qn + 1), 2 * qn + 1, + dp + dn - (qn + 1), qn + 1, + scratch); + + /* Multiply the quotient by the divisor limbs ignored above. */ + if (dn - (qn + 1) > qn) + mpn_mul (scratch, dp, dn - (qn + 1), qp, qn); /* prod is dn-1 limbs */ + else + mpn_mul (scratch, qp, qn, dp, dn - (qn + 1)); /* prod is dn-1 limbs */ + + if (qh) + cy = mpn_add_n (scratch + qn, scratch + qn, dp, dn - (qn + 1)); + else + cy = 0; + scratch[dn - 1] = cy; + + cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1)); + cy = mpn_sub_nc (rp + nn - (2 * qn + 1), + rp + nn - (2 * qn + 1), + scratch + nn - (2 * qn + 1), + qn + 1, cy); + if (cy) + { + qh -= mpn_sub_1 (qp, qp, qn, 1); + mpn_add_n (rp, rp, dp, dn); + } + } + else + { + qh = mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch); + } + + return qh; +} + +static mp_limb_t +mpn_mu_div_qr2 (mp_ptr qp, + mp_ptr rp, + mp_srcptr np, + mp_size_t nn, + mp_srcptr dp, + mp_size_t dn, + mp_ptr scratch) +{ + mp_size_t qn, in; + mp_limb_t cy, qh; + mp_ptr ip, tp; + + ASSERT (dn > 1); + + qn = nn - dn; + + /* Compute the inverse size. */ + in = mpn_mu_div_qr_choose_in (qn, dn, 0); + ASSERT (in <= dn); + +#if 1 + /* This alternative inverse computation method gets slightly more accurate + results. FIXMEs: (1) Temp allocation needs not analysed (2) itch function + not adapted (3) mpn_invertappr scratch needs not met. */ + ip = scratch; + tp = scratch + in + 1; + + /* compute an approximate inverse on (in+1) limbs */ + if (dn == in) + { + MPN_COPY (tp + 1, dp, in); + tp[0] = 1; + mpn_invertappr (ip, tp, in + 1, tp + in + 1); + MPN_COPY_INCR (ip, ip + 1, in); + } + else + { + cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1); + if (UNLIKELY (cy != 0)) + MPN_ZERO (ip, in); + else + { + mpn_invertappr (ip, tp, in + 1, tp + in + 1); + MPN_COPY_INCR (ip, ip + 1, in); + } + } +#else + /* This older inverse computation method gets slightly worse results than the + one above. */ + ip = scratch; + tp = scratch + in; + + /* Compute inverse of D to in+1 limbs, then round to 'in' limbs. Ideally the + inversion function should do this automatically. */ + if (dn == in) + { + tp[in + 1] = 0; + MPN_COPY (tp + in + 2, dp, in); + mpn_invertappr (tp, tp + in + 1, in + 1, NULL); + } + else + { + mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL); + } + cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT); + if (UNLIKELY (cy != 0)) + MPN_ZERO (tp + 1, in); + MPN_COPY (ip, tp + 1, in); +#endif + + qh = mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in); + + return qh; +} + +mp_limb_t +mpn_preinv_mu_div_qr (mp_ptr qp, + mp_ptr rp, + mp_srcptr np, + mp_size_t nn, + mp_srcptr dp, + mp_size_t dn, + mp_srcptr ip, + mp_size_t in, + mp_ptr scratch) +{ + mp_size_t qn; + mp_limb_t cy, cx, qh; + mp_limb_t r; + mp_size_t tn, wn; + +#define tp scratch +#define scratch_out (scratch + tn) + + qn = nn - dn; + + np += qn; + qp += qn; + + qh = mpn_cmp (np, dp, dn) >= 0; + if (qh != 0) + mpn_sub_n (rp, np, dp, dn); + else + MPN_COPY_INCR (rp, np, dn); + + /* if (qn == 0) */ /* The while below handles this case */ + /* return qh; */ /* Degenerate use. Should we allow this? */ + + while (qn > 0) + { + if (qn < in) + { + ip += in - qn; + in = qn; + } + np -= in; + qp -= in; + + /* Compute the next block of quotient limbs by multiplying the inverse I + by the upper part of the partial remainder R. 
*/ + mpn_mul_n (tp, rp + dn - in, ip, in); /* mulhi */ + cy = mpn_add_n (qp, tp + in, rp + dn - in, in); /* I's msb implicit */ + ASSERT_ALWAYS (cy == 0); + + qn -= in; + + /* Compute the product of the quotient block and the divisor D, to be + subtracted from the partial remainder combined with new limbs from the + dividend N. We only really need the low dn+1 limbs. */ + + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, in); /* dn+in limbs, high 'in' cancels */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn + 1); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); + wn = dn + in - tn; /* number of wrapped limbs */ + if (wn > 0) + { + cy = mpn_sub_n (tp, tp, rp + dn - wn, wn); + cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy); + cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0; + ASSERT_ALWAYS (cx >= cy); + mpn_incr_u (tp, cx - cy); + } + } + + r = rp[dn - in] - tp[dn]; + + /* Subtract the product from the partial remainder combined with new + limbs from the dividend N, generating a new partial remainder R. */ + if (dn != in) + { + cy = mpn_sub_n (tp, np, tp, in); /* get next 'in' limbs from N */ + cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy); + MPN_COPY (rp, tp, dn); /* FIXME: try to avoid this */ + } + else + { + cy = mpn_sub_n (rp, np, tp, in); /* get next 'in' limbs from N */ + } + + STAT (int i; int err = 0; + static int errarr[5]; static int err_rec; static int tot); + + /* Check the remainder R and adjust the quotient as needed. */ + r -= cy; + while (r != 0) + { + /* We loop 0 times with about 69% probability, 1 time with about 31% + probability, 2 times with about 0.6% probability, if inverse is + computed as recommended. */ + mpn_incr_u (qp, 1); + cy = mpn_sub_n (rp, rp, dp, dn); + r -= cy; + STAT (err++); + } + if (mpn_cmp (rp, dp, dn) >= 0) + { + /* This is executed with about 76% probability. */ + mpn_incr_u (qp, 1); + cy = mpn_sub_n (rp, rp, dp, dn); + STAT (err++); + } + + STAT ( + tot++; + errarr[err]++; + if (err > err_rec) + err_rec = err; + if (tot % 0x10000 == 0) + { + for (i = 0; i <= err_rec; i++) + printf (" %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot); + printf ("\n"); + } + ); + } + + return qh; +} + +/* In case k=0 (automatic choice), we distinguish 3 cases: + (a) dn < qn: in = ceil(qn / ceil(qn/dn)) + (b) dn/3 < qn <= dn: in = ceil(qn / 2) + (c) qn < dn/3: in = qn + In all cases we have in <= dn. + */ +static mp_size_t +mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k) +{ + mp_size_t in; + + if (k == 0) + { + mp_size_t b; + if (qn > dn) + { + /* Compute an inverse size that is a nice partition of the quotient. 
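+
+             For instance (illustrative numbers): qn = 1000 and dn = 300 give
+             b = ceil(1000/300) = 4 blocks and in = ceil(1000/4) = 250 <= dn.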
*/ + b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ + in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ + } + else if (3 * qn > dn) + { + in = (qn - 1) / 2 + 1; /* b = 2 */ + } + else + { + in = (qn - 1) / 1 + 1; /* b = 1 */ + } + } + else + { + mp_size_t xn; + xn = MIN (dn, qn); + in = (xn - 1) / k + 1; + } + + return in; +} + +mp_size_t +mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k) +{ + mp_size_t in = mpn_mu_div_qr_choose_in (nn - dn, dn, mua_k); + mp_size_t itch_preinv = mpn_preinv_mu_div_qr_itch (nn, dn, in); + mp_size_t itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */ + + ASSERT (itch_preinv >= itch_invapp); + return in + MAX (itch_invapp, itch_preinv); +} + +mp_size_t +mpn_preinv_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, mp_size_t in) +{ + mp_size_t itch_local = mpn_mulmod_bnm1_next_size (dn + 1); + mp_size_t itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in); + + return itch_local + itch_out; +} diff --git a/gmp-6.3.0/mpn/generic/mu_divappr_q.c b/gmp-6.3.0/mpn/generic/mu_divappr_q.c new file mode 100644 index 0000000..0ef7e03 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mu_divappr_q.c @@ -0,0 +1,368 @@ +/* mpn_mu_divappr_q, mpn_preinv_mu_divappr_q. + + Compute Q = floor(N / D) + e. N is nn limbs, D is dn limbs and must be + normalized, and Q must be nn-dn limbs, 0 <= e <= 4. The requirement that Q + is nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us + to let N be unmodified during the operation. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2005-2007, 2009, 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* + The idea of the algorithm used herein is to compute a smaller inverted value + than used in the standard Barrett algorithm, and thus save time in the + Newton iterations, and pay just a small price when using the inverted value + for developing quotient bits. This algorithm was presented at ICMS 2006. +*/ + +/* CAUTION: This code and the code in mu_div_qr.c should be edited in sync. + + Things to work on: + + * The itch/scratch scheme isn't perhaps such a good idea as it once seemed, + demonstrated by the fact that the mpn_invertappr function's scratch needs + mean that we need to keep a large allocation long after it is needed. 
+ Things are worse as mpn_mul_fft does not accept any scratch parameter, + which means we'll have a large memory hole while in mpn_mul_fft. In + general, a peak scratch need in the beginning of a function isn't + well-handled by the itch/scratch scheme. +*/ + +#ifdef STAT +#undef STAT +#define STAT(x) x +#else +#define STAT(x) +#endif + +#include /* for NULL */ +#include "gmp-impl.h" + +static mp_limb_t mpn_preinv_mu_divappr_q (mp_ptr, mp_srcptr, mp_size_t, + mp_srcptr, mp_size_t, mp_srcptr, mp_size_t, mp_ptr); +static mp_size_t mpn_mu_divappr_q_choose_in (mp_size_t, mp_size_t, int); + +mp_limb_t +mpn_mu_divappr_q (mp_ptr qp, + mp_srcptr np, + mp_size_t nn, + mp_srcptr dp, + mp_size_t dn, + mp_ptr scratch) +{ + mp_size_t qn, in; + mp_limb_t cy, qh; + mp_ptr ip, tp; + + ASSERT (dn > 1); + + qn = nn - dn; + + /* If Q is smaller than D, truncate operands. */ + if (qn + 1 < dn) + { + np += dn - (qn + 1); + nn -= dn - (qn + 1); + dp += dn - (qn + 1); + dn = qn + 1; + } + + /* Compute the inverse size. */ + in = mpn_mu_divappr_q_choose_in (qn, dn, 0); + ASSERT (in <= dn); + +#if 1 + /* This alternative inverse computation method gets slightly more accurate + results. FIXMEs: (1) Temp allocation needs not analysed (2) itch function + not adapted (3) mpn_invertappr scratch needs not met. */ + ip = scratch; + tp = scratch + in + 1; + + /* compute an approximate inverse on (in+1) limbs */ + if (dn == in) + { + MPN_COPY (tp + 1, dp, in); + tp[0] = 1; + mpn_invertappr (ip, tp, in + 1, tp + in + 1); + MPN_COPY_INCR (ip, ip + 1, in); + } + else + { + cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1); + if (UNLIKELY (cy != 0)) + MPN_ZERO (ip, in); + else + { + mpn_invertappr (ip, tp, in + 1, tp + in + 1); + MPN_COPY_INCR (ip, ip + 1, in); + } + } +#else + /* This older inverse computation method gets slightly worse results than the + one above. */ + ip = scratch; + tp = scratch + in; + + /* Compute inverse of D to in+1 limbs, then round to 'in' limbs. Ideally the + inversion function should do this automatically. */ + if (dn == in) + { + tp[in + 1] = 0; + MPN_COPY (tp + in + 2, dp, in); + mpn_invertappr (tp, tp + in + 1, in + 1, NULL); + } + else + { + mpn_invertappr (tp, dp + dn - (in + 1), in + 1, NULL); + } + cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT); + if (UNLIKELY (cy != 0)) + MPN_ZERO (tp + 1, in); + MPN_COPY (ip, tp + 1, in); +#endif + + qh = mpn_preinv_mu_divappr_q (qp, np, nn, dp, dn, ip, in, scratch + in); + + return qh; +} + +static mp_limb_t +mpn_preinv_mu_divappr_q (mp_ptr qp, + mp_srcptr np, + mp_size_t nn, + mp_srcptr dp, + mp_size_t dn, + mp_srcptr ip, + mp_size_t in, + mp_ptr scratch) +{ + mp_size_t qn; + mp_limb_t cy, cx, qh; + mp_limb_t r; + mp_size_t tn, wn; + +#define rp scratch +#define tp (scratch + dn) +#define scratch_out (scratch + dn + tn) + + qn = nn - dn; + + np += qn; + qp += qn; + + qh = mpn_cmp (np, dp, dn) >= 0; + if (qh != 0) + mpn_sub_n (rp, np, dp, dn); + else + MPN_COPY (rp, np, dn); + + if (UNLIKELY (qn == 0)) + return qh; /* Degenerate use. Should we allow this? */ + + for (;;) /* The exit condition (qn == 0) is verified in the loop. */ + { + if (qn < in) + { + ip += in - qn; + in = qn; + } + np -= in; + qp -= in; + + /* Compute the next block of quotient limbs by multiplying the inverse I + by the upper part of the partial remainder R. 
*/ + mpn_mul_n (tp, rp + dn - in, ip, in); /* mulhi */ + cy = mpn_add_n (qp, tp + in, rp + dn - in, in); /* I's msb implicit */ + ASSERT_ALWAYS (cy == 0); + + qn -= in; + if (qn == 0) + break; + + /* Compute the product of the quotient block and the divisor D, to be + subtracted from the partial remainder combined with new limbs from the + dividend N. We only really need the low dn limbs. */ + + if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD)) + mpn_mul (tp, dp, dn, qp, in); /* dn+in limbs, high 'in' cancels */ + else + { + tn = mpn_mulmod_bnm1_next_size (dn + 1); + mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out); + wn = dn + in - tn; /* number of wrapped limbs */ + if (wn > 0) + { + cy = mpn_sub_n (tp, tp, rp + dn - wn, wn); + cy = mpn_sub_1 (tp + wn, tp + wn, tn - wn, cy); + cx = mpn_cmp (rp + dn - in, tp + dn, tn - dn) < 0; + ASSERT_ALWAYS (cx >= cy); + mpn_incr_u (tp, cx - cy); + } + } + + r = rp[dn - in] - tp[dn]; + + /* Subtract the product from the partial remainder combined with new + limbs from the dividend N, generating a new partial remainder R. */ + if (dn != in) + { + cy = mpn_sub_n (tp, np, tp, in); /* get next 'in' limbs from N */ + cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy); + MPN_COPY (rp, tp, dn); /* FIXME: try to avoid this */ + } + else + { + cy = mpn_sub_n (rp, np, tp, in); /* get next 'in' limbs from N */ + } + + STAT (int i; int err = 0; + static int errarr[5]; static int err_rec; static int tot); + + /* Check the remainder R and adjust the quotient as needed. */ + r -= cy; + while (r != 0) + { + /* We loop 0 times with about 69% probability, 1 time with about 31% + probability, 2 times with about 0.6% probability, if inverse is + computed as recommended. */ + mpn_incr_u (qp, 1); + cy = mpn_sub_n (rp, rp, dp, dn); + r -= cy; + STAT (err++); + } + if (mpn_cmp (rp, dp, dn) >= 0) + { + /* This is executed with about 76% probability. */ + mpn_incr_u (qp, 1); + cy = mpn_sub_n (rp, rp, dp, dn); + STAT (err++); + } + + STAT ( + tot++; + errarr[err]++; + if (err > err_rec) + err_rec = err; + if (tot % 0x10000 == 0) + { + for (i = 0; i <= err_rec; i++) + printf (" %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot); + printf ("\n"); + } + ); + } + + /* FIXME: We should perhaps be somewhat more elegant in our rounding of the + quotient. For now, just make sure the returned quotient is >= the real + quotient; add 3 with saturating arithmetic. */ + qn = nn - dn; + cy += mpn_add_1 (qp, qp, qn, 3); + if (cy != 0) + { + if (qh != 0) + { + /* Return a quotient of just 1-bits, with qh set. */ + mp_size_t i; + for (i = 0; i < qn; i++) + qp[i] = GMP_NUMB_MAX; + } + else + { + /* Propagate carry into qh. */ + qh = 1; + } + } + + return qh; +} + +/* In case k=0 (automatic choice), we distinguish 3 cases: + (a) dn < qn: in = ceil(qn / ceil(qn/dn)) + (b) dn/3 < qn <= dn: in = ceil(qn / 2) + (c) qn < dn/3: in = qn + In all cases we have in <= dn. + */ +static mp_size_t +mpn_mu_divappr_q_choose_in (mp_size_t qn, mp_size_t dn, int k) +{ + mp_size_t in; + + if (k == 0) + { + mp_size_t b; + if (qn > dn) + { + /* Compute an inverse size that is a nice partition of the quotient. 
*/ + b = (qn - 1) / dn + 1; /* ceil(qn/dn), number of blocks */ + in = (qn - 1) / b + 1; /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */ + } + else if (3 * qn > dn) + { + in = (qn - 1) / 2 + 1; /* b = 2 */ + } + else + { + in = (qn - 1) / 1 + 1; /* b = 1 */ + } + } + else + { + mp_size_t xn; + xn = MIN (dn, qn); + in = (xn - 1) / k + 1; + } + + return in; +} + +mp_size_t +mpn_mu_divappr_q_itch (mp_size_t nn, mp_size_t dn, int mua_k) +{ + mp_size_t qn, in, itch_local, itch_out, itch_invapp; + + qn = nn - dn; + if (qn + 1 < dn) + { + dn = qn + 1; + } + in = mpn_mu_divappr_q_choose_in (qn, dn, mua_k); + + itch_local = mpn_mulmod_bnm1_next_size (dn + 1); + itch_out = mpn_mulmod_bnm1_itch (itch_local, dn, in); + itch_invapp = mpn_invertappr_itch (in + 1) + in + 2; /* 3in + 4 */ + + ASSERT (dn + itch_local + itch_out >= itch_invapp); + return in + MAX (dn + itch_local + itch_out, itch_invapp); +} diff --git a/gmp-6.3.0/mpn/generic/mul.c b/gmp-6.3.0/mpn/generic/mul.c new file mode 100644 index 0000000..37444e9 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mul.c @@ -0,0 +1,441 @@ +/* mpn_mul -- Multiply two natural numbers. + + Contributed to the GNU project by Torbjorn Granlund. + +Copyright 1991, 1993, 1994, 1996, 1997, 1999-2003, 2005-2007, 2009, 2010, 2012, +2014, 2019 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + + +#ifndef MUL_BASECASE_MAX_UN +#define MUL_BASECASE_MAX_UN 500 +#endif + +/* Areas where the different toom algorithms can be called (extracted + from the t-toom*.c files, and ignoring small constant offsets): + + 1/6 1/5 1/4 4/13 1/3 3/8 2/5 5/11 1/2 3/5 2/3 3/4 4/5 1 vn/un + 4/7 6/7 + 6/11 + |--------------------| toom22 (small) + || toom22 (large) + |xxxx| toom22 called + |-------------------------------------| toom32 + |xxxxxxxxxxxxxxxx| | toom32 called + |------------| toom33 + |x| toom33 called + |---------------------------------| | toom42 + |xxxxxxxxxxxxxxxxxxxxxxxx| | toom42 called + |--------------------| toom43 + |xxxxxxxxxx| toom43 called + |-----------------------------| toom52 (unused) + |--------| toom44 + |xxxxxxxx| toom44 called + |--------------------| | toom53 + |xxxxxx| toom53 called + |-------------------------| toom62 (unused) + |----------------| toom54 (unused) + |--------------------| toom63 + |xxxxxxxxx| | toom63 called + |---------------------------------| toom6h + |xxxxxxxx| toom6h called + |-------------------------| toom8h (32 bit) + |------------------------------------------| toom8h (64 bit) + |xxxxxxxx| toom8h called +*/ + +#define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn) +#define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn) + +/* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v + (pointed to by VP, with VN limbs), and store the result at PRODP. The + result is UN + VN limbs. Return the most significant limb of the result. + + NOTE: The space pointed to by PRODP is overwritten before finished with U + and V, so overlap is an error. + + Argument constraints: + 1. UN >= VN. + 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from + the multiplier and the multiplicand. */ + +/* + * The cutoff lines in the toomX2 and toomX3 code are now exactly between the + ideal lines of the surrounding algorithms. Is that optimal? + + * The toomX3 code now uses a structure similar to the one of toomX2, except + that it loops longer in the unbalanced case. The result is that the + remaining area might have un < vn. Should we fix the toomX2 code in a + similar way? + + * The toomX3 code is used for the largest non-FFT unbalanced operands. It + therefore calls mpn_mul recursively for certain cases. + + * Allocate static temp space using THRESHOLD variables (except for toom44 + when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all. + + * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42 + have the same vn threshold. This is not true, we should actually use + mul_basecase for slightly larger operands for toom32 than for toom22, and + even larger for toom42. + + * That problem is even more prevalent for toomX3. We therefore use special + THRESHOLD variables there. +*/ + +mp_limb_t +mpn_mul (mp_ptr prodp, + mp_srcptr up, mp_size_t un, + mp_srcptr vp, mp_size_t vn) +{ + ASSERT (un >= vn); + ASSERT (vn >= 1); + ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un)); + ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn)); + + if (BELOW_THRESHOLD (un, MUL_TOOM22_THRESHOLD)) + { + /* When un (and thus vn) is below the toom22 range, do mul_basecase. + Test un and not vn here not to thwart the un >> vn code below. + This special case is not necessary, but cuts the overhead for the + smallest operands. 
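+         (Testing vn instead would send products with huge un but tiny vn
+         through one giant basecase call rather than the blocked scheme
+         further down.)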
*/ + mpn_mul_basecase (prodp, up, un, vp, vn); + } + else if (un == vn) + { + mpn_mul_n (prodp, up, vp, un); + } + else if (vn < MUL_TOOM22_THRESHOLD) + { /* plain schoolbook multiplication */ + + /* Unless un is very large, or else if have an applicable mpn_mul_N, + perform basecase multiply directly. */ + if (un <= MUL_BASECASE_MAX_UN +#if HAVE_NATIVE_mpn_mul_2 + || vn <= 2 +#else + || vn == 1 +#endif + ) + mpn_mul_basecase (prodp, up, un, vp, vn); + else + { + /* We have un >> MUL_BASECASE_MAX_UN > vn. For better memory + locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply + these pieces with the vp[] operand. After each such partial + multiplication (but the last) we copy the most significant vn + limbs into a temporary buffer since that part would otherwise be + overwritten by the next multiplication. After the next + multiplication, we add it back. This illustrates the situation: + + -->vn<-- + | |<------- un ------->| + _____________________| + X /| + /XX__________________/ | + _____________________ | + X / | + /XX__________________/ | + _____________________ | + / / | + /____________________/ | + ================================================================== + + The parts marked with X are the parts whose sums are copied into + the temporary buffer. */ + + mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT]; + mp_limb_t cy; + ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT); + + mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn); + prodp += MUL_BASECASE_MAX_UN; + MPN_COPY (tp, prodp, vn); /* preserve high triangle */ + up += MUL_BASECASE_MAX_UN; + un -= MUL_BASECASE_MAX_UN; + while (un > MUL_BASECASE_MAX_UN) + { + mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn); + cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */ + mpn_incr_u (prodp + vn, cy); + prodp += MUL_BASECASE_MAX_UN; + MPN_COPY (tp, prodp, vn); /* preserve high triangle */ + up += MUL_BASECASE_MAX_UN; + un -= MUL_BASECASE_MAX_UN; + } + if (un > vn) + { + mpn_mul_basecase (prodp, up, un, vp, vn); + } + else + { + ASSERT (un > 0); + mpn_mul_basecase (prodp, vp, vn, up, un); + } + cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */ + mpn_incr_u (prodp + vn, cy); + } + } + else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD)) + { + /* Use ToomX2 variants */ + mp_ptr scratch; + TMP_SDECL; TMP_SMARK; + +#define ITCH_TOOMX2 (9 * vn / 2 + GMP_NUMB_BITS * 2) + scratch = TMP_SALLOC_LIMBS (ITCH_TOOMX2); + ASSERT (mpn_toom22_mul_itch ((5*vn-1)/4, vn) <= ITCH_TOOMX2); /* 5vn/2+ */ + ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX2); /* 7vn/6+ */ + ASSERT (mpn_toom42_mul_itch (3 * vn - 1, vn) <= ITCH_TOOMX2); /* 9vn/2+ */ +#undef ITCH_TOOMX2 + + /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn + square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly + wise; we would get better balance by slightly moving the bound. We + will sometimes end up with un < vn, like in the X3 arm below. */ + if (un >= 3 * vn) + { + mp_limb_t cy; + mp_ptr ws; + + /* The maximum ws usage is for the mpn_mul result. 
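+             After the loop below un < 3*vn, so the final product written to
+             ws takes at most un + vn < 4*vn limbs, matching this allocation.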
*/ + ws = TMP_SALLOC_LIMBS (4 * vn); + + mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch); + un -= 2 * vn; + up += 2 * vn; + prodp += 2 * vn; + + while (un >= 3 * vn) + { + mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch); + un -= 2 * vn; + up += 2 * vn; + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, 2 * vn); + mpn_incr_u (prodp + vn, cy); + prodp += 2 * vn; + } + + /* vn <= un < 3vn */ + + if (4 * un < 5 * vn) + mpn_toom22_mul (ws, up, un, vp, vn, scratch); + else if (4 * un < 7 * vn) + mpn_toom32_mul (ws, up, un, vp, vn, scratch); + else + mpn_toom42_mul (ws, up, un, vp, vn, scratch); + + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, un); + mpn_incr_u (prodp + vn, cy); + } + else + { + if (4 * un < 5 * vn) + mpn_toom22_mul (prodp, up, un, vp, vn, scratch); + else if (4 * un < 7 * vn) + mpn_toom32_mul (prodp, up, un, vp, vn, scratch); + else + mpn_toom42_mul (prodp, up, un, vp, vn, scratch); + } + TMP_SFREE; + } + else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) || + BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD)) + { + /* Handle the largest operands that are not in the FFT range. The 2nd + condition makes very unbalanced operands avoid the FFT code (except + perhaps as coefficient products of the Toom code. */ + + if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn)) + { + /* Use ToomX3 variants */ + mp_ptr scratch; + TMP_DECL; TMP_MARK; + +#define ITCH_TOOMX3 (4 * vn + GMP_NUMB_BITS) + scratch = TMP_ALLOC_LIMBS (ITCH_TOOMX3); + ASSERT (mpn_toom33_mul_itch ((7*vn-1)/6, vn) <= ITCH_TOOMX3); /* 7vn/2+ */ + ASSERT (mpn_toom43_mul_itch ((3*vn-1)/2, vn) <= ITCH_TOOMX3); /* 9vn/4+ */ + ASSERT (mpn_toom32_mul_itch ((7*vn-1)/4, vn) <= ITCH_TOOMX3); /* 7vn/6+ */ + ASSERT (mpn_toom53_mul_itch ((11*vn-1)/6, vn) <= ITCH_TOOMX3); /* 11vn/3+ */ + ASSERT (mpn_toom42_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */ + ASSERT (mpn_toom63_mul_itch ((5*vn-1)/2, vn) <= ITCH_TOOMX3); /* 15vn/4+ */ +#undef ITCH_TOOMX3 + + if (2 * un >= 5 * vn) + { + mp_limb_t cy; + mp_ptr ws; + + /* The maximum ws usage is for the mpn_mul result. 
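+                 Here the loop exits with 2*un < 5*vn, so that product is at
+                 most un + vn < 7*vn/2 limbs, matching this allocation.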
*/ + ws = TMP_ALLOC_LIMBS (7 * vn >> 1); + + if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD)) + mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch); + else + mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch); + un -= 2 * vn; + up += 2 * vn; + prodp += 2 * vn; + + while (2 * un >= 5 * vn) /* un >= 2.5vn */ + { + if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD)) + mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch); + else + mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch); + un -= 2 * vn; + up += 2 * vn; + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, 2 * vn); + mpn_incr_u (prodp + vn, cy); + prodp += 2 * vn; + } + + /* vn / 2 <= un < 2.5vn */ + + if (un < vn) + mpn_mul (ws, vp, vn, up, un); + else + mpn_mul (ws, up, un, vp, vn); + + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, un); + mpn_incr_u (prodp + vn, cy); + } + else + { + if (6 * un < 7 * vn) + mpn_toom33_mul (prodp, up, un, vp, vn, scratch); + else if (2 * un < 3 * vn) + { + if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD)) + mpn_toom32_mul (prodp, up, un, vp, vn, scratch); + else + mpn_toom43_mul (prodp, up, un, vp, vn, scratch); + } + else if (6 * un < 11 * vn) + { + if (4 * un < 7 * vn) + { + if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD)) + mpn_toom32_mul (prodp, up, un, vp, vn, scratch); + else + mpn_toom53_mul (prodp, up, un, vp, vn, scratch); + } + else + { + if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD)) + mpn_toom42_mul (prodp, up, un, vp, vn, scratch); + else + mpn_toom53_mul (prodp, up, un, vp, vn, scratch); + } + } + else + { + if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD)) + mpn_toom42_mul (prodp, up, un, vp, vn, scratch); + else + mpn_toom63_mul (prodp, up, un, vp, vn, scratch); + } + } + TMP_FREE; + } + else + { + mp_ptr scratch; + TMP_DECL; TMP_MARK; + + if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD)) + { + scratch = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (un, vn)); + mpn_toom44_mul (prodp, up, un, vp, vn, scratch); + } + else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD)) + { + scratch = TMP_SALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn)); + mpn_toom6h_mul (prodp, up, un, vp, vn, scratch); + } + else + { + scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn)); + mpn_toom8h_mul (prodp, up, un, vp, vn, scratch); + } + TMP_FREE; + } + } + else + { + if (un >= 8 * vn) + { + mp_limb_t cy; + mp_ptr ws; + TMP_DECL; TMP_MARK; + + /* The maximum ws usage is for the mpn_mul result. */ + ws = TMP_BALLOC_LIMBS (9 * vn >> 1); + + mpn_fft_mul (prodp, up, 3 * vn, vp, vn); + un -= 3 * vn; + up += 3 * vn; + prodp += 3 * vn; + + while (2 * un >= 7 * vn) /* un >= 3.5vn */ + { + mpn_fft_mul (ws, up, 3 * vn, vp, vn); + un -= 3 * vn; + up += 3 * vn; + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, 3 * vn); + mpn_incr_u (prodp + vn, cy); + prodp += 3 * vn; + } + + /* vn / 2 <= un < 3.5vn */ + + if (un < vn) + mpn_mul (ws, vp, vn, up, un); + else + mpn_mul (ws, up, un, vp, vn); + + cy = mpn_add_n (prodp, prodp, ws, vn); + MPN_COPY (prodp + vn, ws + vn, un); + mpn_incr_u (prodp + vn, cy); + + TMP_FREE; + } + else + mpn_fft_mul (prodp, up, un, vp, vn); + } + + return prodp[un + vn - 1]; /* historic */ +} diff --git a/gmp-6.3.0/mpn/generic/mul_1.c b/gmp-6.3.0/mpn/generic/mul_1.c new file mode 100644 index 0000000..52d46da --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mul_1.c @@ -0,0 +1,96 @@ +/* mpn_mul_1 -- Multiply a limb vector with a single limb and store the + product in a second limb vector. 
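+   That is, {rp, n} <- {up, n} * vl, returning the most significant limb of
+   the (n+1)-limb product as carry.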
+ +Copyright 1991-1994, 1996, 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +#if GMP_NAIL_BITS == 0 + +mp_limb_t +mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl) +{ + mp_limb_t ul, cl, hpl, lpl; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_INCR_P (rp, up, n)); + + cl = 0; + do + { + ul = *up++; + umul_ppmm (hpl, lpl, ul, vl); + + lpl += cl; + cl = (lpl < cl) + hpl; + + *rp++ = lpl; + } + while (--n != 0); + + return cl; +} + +#endif + +#if GMP_NAIL_BITS >= 1 + +mp_limb_t +mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t vl) +{ + mp_limb_t shifted_vl, ul, lpl, hpl, prev_hpl, xw, cl, xl; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_INCR_P (rp, up, n)); + ASSERT_MPN (up, n); + ASSERT_LIMB (vl); + + shifted_vl = vl << GMP_NAIL_BITS; + cl = 0; + prev_hpl = 0; + do + { + ul = *up++; + + umul_ppmm (hpl, lpl, ul, shifted_vl); + lpl >>= GMP_NAIL_BITS; + xw = prev_hpl + lpl + cl; + cl = xw >> GMP_NUMB_BITS; + xl = xw & GMP_NUMB_MASK; + *rp++ = xl; + prev_hpl = hpl; + } + while (--n != 0); + + return prev_hpl + cl; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/mul_basecase.c b/gmp-6.3.0/mpn/generic/mul_basecase.c new file mode 100644 index 0000000..2487fba --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mul_basecase.c @@ -0,0 +1,165 @@ +/* mpn_mul_basecase -- Internal routine to multiply two natural numbers + of length m and n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + +Copyright 1991-1994, 1996, 1997, 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + + +/* Multiply {up,usize} by {vp,vsize} and write the result to + {prodp,usize+vsize}. Must have usize>=vsize. + + Note that prodp gets usize+vsize limbs stored, even if the actual result + only needs usize+vsize-1. + + There's no good reason to call here with vsize>=MUL_TOOM22_THRESHOLD. + Currently this is allowed, but it might not be in the future. + + This is the most critical code for multiplication. All multiplies rely + on this, both small and huge. Small ones arrive here immediately, huge + ones arrive here as this is the base case for Karatsuba's recursive + algorithm. */ + +void +mpn_mul_basecase (mp_ptr rp, + mp_srcptr up, mp_size_t un, + mp_srcptr vp, mp_size_t vn) +{ + ASSERT (un >= vn); + ASSERT (vn >= 1); + ASSERT (! MPN_OVERLAP_P (rp, un+vn, up, un)); + ASSERT (! MPN_OVERLAP_P (rp, un+vn, vp, vn)); + + /* We first multiply by the low order limb (or depending on optional function + availability, limbs). This result can be stored, not added, to rp. We + also avoid a loop for zeroing this way. */ + +#if HAVE_NATIVE_mpn_mul_2 + if (vn >= 2) + { + rp[un + 1] = mpn_mul_2 (rp, up, un, vp); + rp += 2, vp += 2, vn -= 2; + } + else + { + rp[un] = mpn_mul_1 (rp, up, un, vp[0]); + return; + } +#else + rp[un] = mpn_mul_1 (rp, up, un, vp[0]); + rp += 1, vp += 1, vn -= 1; +#endif + + /* Now accumulate the product of up[] and the next higher limb (or depending + on optional function availability, limbs) from vp[]. */ + +#define MAX_LEFT MP_SIZE_T_MAX /* Used to simplify loops into if statements */ + + +#if HAVE_NATIVE_mpn_addmul_6 + while (vn >= 6) + { + rp[un + 6 - 1] = mpn_addmul_6 (rp, up, un, vp); + if (MAX_LEFT == 6) + return; + rp += 6, vp += 6, vn -= 6; + if (MAX_LEFT < 2 * 6) + break; + } +#undef MAX_LEFT +#define MAX_LEFT (6 - 1) +#endif + +#if HAVE_NATIVE_mpn_addmul_5 + while (vn >= 5) + { + rp[un + 5 - 1] = mpn_addmul_5 (rp, up, un, vp); + if (MAX_LEFT == 5) + return; + rp += 5, vp += 5, vn -= 5; + if (MAX_LEFT < 2 * 5) + break; + } +#undef MAX_LEFT +#define MAX_LEFT (5 - 1) +#endif + +#if HAVE_NATIVE_mpn_addmul_4 + while (vn >= 4) + { + rp[un + 4 - 1] = mpn_addmul_4 (rp, up, un, vp); + if (MAX_LEFT == 4) + return; + rp += 4, vp += 4, vn -= 4; + if (MAX_LEFT < 2 * 4) + break; + } +#undef MAX_LEFT +#define MAX_LEFT (4 - 1) +#endif + +#if HAVE_NATIVE_mpn_addmul_3 + while (vn >= 3) + { + rp[un + 3 - 1] = mpn_addmul_3 (rp, up, un, vp); + if (MAX_LEFT == 3) + return; + rp += 3, vp += 3, vn -= 3; + if (MAX_LEFT < 2 * 3) + break; + } +#undef MAX_LEFT +#define MAX_LEFT (3 - 1) +#endif + +#if HAVE_NATIVE_mpn_addmul_2 + while (vn >= 2) + { + rp[un + 2 - 1] = mpn_addmul_2 (rp, up, un, vp); + if (MAX_LEFT == 2) + return; + rp += 2, vp += 2, vn -= 2; + if (MAX_LEFT < 2 * 2) + break; + } +#undef MAX_LEFT +#define MAX_LEFT (2 - 1) +#endif + + while (vn >= 1) + { + rp[un] = mpn_addmul_1 (rp, up, un, vp[0]); + if (MAX_LEFT == 1) + return; + rp += 1, vp += 1, vn -= 1; + } +} diff --git a/gmp-6.3.0/mpn/generic/mul_fft.c b/gmp-6.3.0/mpn/generic/mul_fft.c new file mode 100644 index 0000000..76a2106 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mul_fft.c @@ -0,0 +1,1105 @@ +/* Schoenhage's fast multiplication modulo 2^N+1. + + Contributed by Paul Zimmermann. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 1998-2010, 2012, 2013, 2018, 2020, 2022 Free Software +Foundation, Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* References: + + Schnelle Multiplikation grosser Zahlen, by Arnold Schoenhage and Volker + Strassen, Computing 7, p. 281-292, 1971. + + Asymptotically fast algorithms for the numerical multiplication and division + of polynomials with complex coefficients, by Arnold Schoenhage, Computer + Algebra, EUROCAM'82, LNCS 144, p. 3-15, 1982. + + Tapes versus Pointers, a study in implementing fast algorithms, by Arnold + Schoenhage, Bulletin of the EATCS, 30, p. 23-32, 1986. + + TODO: + + Implement some of the tricks published at ISSAC'2007 by Gaudry, Kruppa, and + Zimmermann. + + It might be possible to avoid a small number of MPN_COPYs by using a + rotating temporary or two. + + Cleanup and simplify the code! +*/ + +#ifdef TRACE +#undef TRACE +#define TRACE(x) x +#include +#else +#define TRACE(x) +#endif + +#include "gmp-impl.h" + +#ifdef WANT_ADDSUB +#include "generic/add_n_sub_n.c" +#define HAVE_NATIVE_mpn_add_n_sub_n 1 +#endif + +static mp_limb_t mpn_mul_fft_internal (mp_ptr, mp_size_t, int, mp_ptr *, + mp_ptr *, mp_ptr, mp_ptr, mp_size_t, + mp_size_t, mp_size_t, int **, mp_ptr, int); +static void mpn_mul_fft_decompose (mp_ptr, mp_ptr *, mp_size_t, mp_size_t, mp_srcptr, + mp_size_t, mp_size_t, mp_size_t, mp_ptr); + + +/* Find the best k to use for a mod 2^(m*GMP_NUMB_BITS)+1 FFT for m >= n. + We have sqr=0 if for a multiply, sqr=1 for a square. + There are three generations of this code; we keep the old ones as long as + some gmp-mparam.h is not updated. 
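+   In the table3 format used below, each entry pairs a size with a k; the
+   lookup keeps the previous entry's k for operand sizes up to the current
+   entry's size shifted left by that previous k.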
*/ + + +/*****************************************************************************/ + +#if TUNE_PROGRAM_BUILD || (defined (MUL_FFT_TABLE3) && defined (SQR_FFT_TABLE3)) + +#ifndef FFT_TABLE3_SIZE /* When tuning this is defined in gmp-impl.h */ +#if defined (MUL_FFT_TABLE3_SIZE) && defined (SQR_FFT_TABLE3_SIZE) +#if MUL_FFT_TABLE3_SIZE > SQR_FFT_TABLE3_SIZE +#define FFT_TABLE3_SIZE MUL_FFT_TABLE3_SIZE +#else +#define FFT_TABLE3_SIZE SQR_FFT_TABLE3_SIZE +#endif +#endif +#endif + +#ifndef FFT_TABLE3_SIZE +#define FFT_TABLE3_SIZE 200 +#endif + +FFT_TABLE_ATTRS struct fft_table_nk mpn_fft_table3[2][FFT_TABLE3_SIZE] = +{ + MUL_FFT_TABLE3, + SQR_FFT_TABLE3 +}; + +int +mpn_fft_best_k (mp_size_t n, int sqr) +{ + const struct fft_table_nk *fft_tab, *tab; + mp_size_t tab_n, thres; + int last_k; + + fft_tab = mpn_fft_table3[sqr]; + last_k = fft_tab->k; + for (tab = fft_tab + 1; ; tab++) + { + tab_n = tab->n; + thres = tab_n << last_k; + if (n <= thres) + break; + last_k = tab->k; + } + return last_k; +} + +#define MPN_FFT_BEST_READY 1 +#endif + +/*****************************************************************************/ + +#if ! defined (MPN_FFT_BEST_READY) +FFT_TABLE_ATTRS mp_size_t mpn_fft_table[2][MPN_FFT_TABLE_SIZE] = +{ + MUL_FFT_TABLE, + SQR_FFT_TABLE +}; + +int +mpn_fft_best_k (mp_size_t n, int sqr) +{ + int i; + + for (i = 0; mpn_fft_table[sqr][i] != 0; i++) + if (n < mpn_fft_table[sqr][i]) + return i + FFT_FIRST_K; + + /* treat 4*last as one further entry */ + if (i == 0 || n < 4 * mpn_fft_table[sqr][i - 1]) + return i + FFT_FIRST_K; + else + return i + FFT_FIRST_K + 1; +} +#endif + +/*****************************************************************************/ + + +/* Returns smallest possible number of limbs >= pl for a fft of size 2^k, + i.e. smallest multiple of 2^k >= pl. + + Don't declare static: needed by tuneup. +*/ + +mp_size_t +mpn_fft_next_size (mp_size_t pl, int k) +{ + pl = 1 + ((pl - 1) >> k); /* ceil (pl/2^k) */ + return pl << k; +} + + +/* Initialize l[i][j] with bitrev(j) */ +static void +mpn_fft_initl (int **l, int k) +{ + int i, j, K; + int *li; + + l[0][0] = 0; + for (i = 1, K = 1; i <= k; i++, K *= 2) + { + li = l[i]; + for (j = 0; j < K; j++) + { + li[j] = 2 * l[i - 1][j]; + li[K + j] = 1 + li[j]; + } + } +} + + +/* r <- a*2^d mod 2^(n*GMP_NUMB_BITS)+1 with a = {a, n+1} + Assumes a is semi-normalized, i.e. a[n] <= 1. + r and a must have n+1 limbs, and not overlap. +*/ +static void +mpn_fft_mul_2exp_modF (mp_ptr r, mp_srcptr a, mp_bitcnt_t d, mp_size_t n) +{ + unsigned int sh; + mp_size_t m; + mp_limb_t cc, rd; + + sh = d % GMP_NUMB_BITS; + m = d / GMP_NUMB_BITS; + + if (m >= n) /* negate */ + { + /* r[0..m-1] <-- lshift(a[n-m]..a[n-1], sh) + r[m..n-1] <-- -lshift(a[0]..a[n-m-1], sh) */ + + m -= n; + if (sh != 0) + { + /* no out shift below since a[n] <= 1 */ + mpn_lshift (r, a + n - m, m + 1, sh); + rd = r[m]; + cc = mpn_lshiftc (r + m, a, n - m, sh); + } + else + { + MPN_COPY (r, a + n - m, m); + rd = a[n]; + mpn_com (r + m, a, n - m); + cc = 0; + } + + /* add cc to r[0], and add rd to r[m] */ + + /* now add 1 in r[m], subtract 1 in r[n], i.e. 
add 1 in r[0] */ + + r[n] = 0; + /* cc < 2^sh <= 2^(GMP_NUMB_BITS-1) thus no overflow here */ + ++cc; + MPN_INCR_U (r, n + 1, cc); + + ++rd; + /* rd might overflow when sh=GMP_NUMB_BITS-1 */ + cc = rd + (rd == 0); + r = r + m + (rd == 0); + MPN_INCR_U (r, n + 1 - m - (rd == 0), cc); + } + else + { + /* r[0..m-1] <-- -lshift(a[n-m]..a[n-1], sh) + r[m..n-1] <-- lshift(a[0]..a[n-m-1], sh) */ + if (sh != 0) + { + /* no out bits below since a[n] <= 1 */ + mpn_lshiftc (r, a + n - m, m + 1, sh); + rd = ~r[m]; + /* {r, m+1} = {a+n-m, m+1} << sh */ + cc = mpn_lshift (r + m, a, n - m, sh); /* {r+m, n-m} = {a, n-m}<GMP_NUMB_MAX+1. Never triggered. + Is it actually possible? */ + r[n] = 0; + MPN_INCR_U (r, n + 1, cy); + } + } +} + +#if HAVE_NATIVE_mpn_add_n_sub_n +static inline void +mpn_fft_add_sub_modF (mp_ptr A0, mp_ptr Ai, mp_srcptr tp, mp_size_t n) +{ + mp_limb_t cyas, c, x; + + cyas = mpn_add_n_sub_n (A0, Ai, A0, tp, n); + + c = A0[n] - tp[n] - (cyas & 1); + x = (-c) & -((c & GMP_LIMB_HIGHBIT) != 0); + Ai[n] = x + c; + MPN_INCR_U (Ai, n + 1, x); + + c = A0[n] + tp[n] + (cyas >> 1); + x = (c - 1) & -(c != 0); + A0[n] = c - x; + MPN_DECR_U (A0, n + 1, x); +} + +#else /* ! HAVE_NATIVE_mpn_add_n_sub_n */ + +/* r <- a+b mod 2^(n*GMP_NUMB_BITS)+1. + Assumes a and b are semi-normalized. +*/ +static inline void +mpn_fft_add_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n) +{ + mp_limb_t c, x; + + c = a[n] + b[n] + mpn_add_n (r, a, b, n); + /* 0 <= c <= 3 */ + +#if 1 + /* GCC 4.1 outsmarts most expressions here, and generates a 50% branch. The + result is slower code, of course. But the following outsmarts GCC. */ + x = (c - 1) & -(c != 0); + r[n] = c - x; + MPN_DECR_U (r, n + 1, x); +#endif +#if 0 + if (c > 1) + { + r[n] = 1; /* r[n] - c = 1 */ + MPN_DECR_U (r, n + 1, c - 1); + } + else + { + r[n] = c; + } +#endif +} + +/* r <- a-b mod 2^(n*GMP_NUMB_BITS)+1. + Assumes a and b are semi-normalized. +*/ +static inline void +mpn_fft_sub_modF (mp_ptr r, mp_srcptr a, mp_srcptr b, mp_size_t n) +{ + mp_limb_t c, x; + + c = a[n] - b[n] - mpn_sub_n (r, a, b, n); + /* -2 <= c <= 1 */ + +#if 1 + /* GCC 4.1 outsmarts most expressions here, and generates a 50% branch. The + result is slower code, of course. But the following outsmarts GCC. */ + x = (-c) & -((c & GMP_LIMB_HIGHBIT) != 0); + r[n] = x + c; + MPN_INCR_U (r, n + 1, x); +#endif +#if 0 + if ((c & GMP_LIMB_HIGHBIT) != 0) + { + r[n] = 0; + MPN_INCR_U (r, n + 1, -c); + } + else + { + r[n] = c; + } +#endif +} +#endif /* HAVE_NATIVE_mpn_add_n_sub_n */ + +/* input: A[0] ... 
A[inc*(K-1)] are residues mod 2^N+1 where + N=n*GMP_NUMB_BITS, and 2^omega is a primitive root mod 2^N+1 + output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 */ + +static void +mpn_fft_fft (mp_ptr *Ap, mp_size_t K, int **ll, + mp_size_t omega, mp_size_t n, mp_size_t inc, mp_ptr tp) +{ + if (K == 2) + { + mp_limb_t cy; +#if HAVE_NATIVE_mpn_add_n_sub_n + cy = mpn_add_n_sub_n (Ap[0], Ap[inc], Ap[0], Ap[inc], n + 1) & 1; +#else + MPN_COPY (tp, Ap[0], n + 1); + mpn_add_n (Ap[0], Ap[0], Ap[inc], n + 1); + cy = mpn_sub_n (Ap[inc], tp, Ap[inc], n + 1); +#endif + if (Ap[0][n] > 1) /* can be 2 or 3 */ + { /* Ap[0][n] = 1 - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - 1); */ + mp_limb_t cc = Ap[0][n] - 1; + Ap[0][n] = 1; + MPN_DECR_U (Ap[0], n + 1, cc); + } + if (cy) /* Ap[inc][n] can be -1 or -2 */ + { /* Ap[inc][n] = mpn_add_1 (Ap[inc], Ap[inc], n, ~Ap[inc][n] + 1); */ + mp_limb_t cc = ~Ap[inc][n] + 1; + Ap[inc][n] = 0; + MPN_INCR_U (Ap[inc], n + 1, cc); + } + } + else + { + mp_size_t j, K2 = K >> 1; + int *lk = *ll; + + mpn_fft_fft (Ap, K2, ll-1, 2 * omega, n, inc * 2, tp); + mpn_fft_fft (Ap+inc, K2, ll-1, 2 * omega, n, inc * 2, tp); + /* A[2*j*inc] <- A[2*j*inc] + omega^l[k][2*j*inc] A[(2j+1)inc] + A[(2j+1)inc] <- A[2*j*inc] + omega^l[k][(2j+1)inc] A[(2j+1)inc] */ + for (j = 0; j < K2; j++, lk += 2, Ap += 2 * inc) + { + /* Ap[inc] <- Ap[0] + Ap[inc] * 2^(lk[1] * omega) + Ap[0] <- Ap[0] + Ap[inc] * 2^(lk[0] * omega) */ + mpn_fft_mul_2exp_modF (tp, Ap[inc], lk[0] * omega, n); +#if HAVE_NATIVE_mpn_add_n_sub_n + mpn_fft_add_sub_modF (Ap[0], Ap[inc], tp, n); +#else + mpn_fft_sub_modF (Ap[inc], Ap[0], tp, n); + mpn_fft_add_modF (Ap[0], Ap[0], tp, n); +#endif + } + } +} + +/* input: A[0] ... A[inc*(K-1)] are residues mod 2^N+1 where + N=n*GMP_NUMB_BITS, and 2^omega is a primitive root mod 2^N+1 + output: A[inc*l[k][i]] <- \sum (2^omega)^(ij) A[inc*j] mod 2^N+1 + tp must have space for 2*(n+1) limbs. +*/ + + +/* Given ap[0..n] with ap[n]<=1, reduce it modulo 2^(n*GMP_NUMB_BITS)+1, + by subtracting that modulus if necessary. + + If ap[0..n] is exactly 2^(n*GMP_NUMB_BITS) then mpn_sub_1 produces a + borrow and the limbs must be zeroed out again. This will occur very + infrequently. */ + +static inline void +mpn_fft_normalize (mp_ptr ap, mp_size_t n) +{ + if (ap[n] != 0) + { + MPN_DECR_U (ap, n + 1, CNST_LIMB(1)); + if (ap[n] == 0) + { + /* This happens with very low probability; we have yet to trigger it, + and thereby make sure this code is correct. */ + MPN_ZERO (ap, n); + ap[n] = 1; + } + else + ap[n] = 0; + } +} + +/* a[i] <- a[i]*b[i] mod 2^(n*GMP_NUMB_BITS)+1 for 0 <= i < K */ +static void +mpn_fft_mul_modF_K (mp_ptr *ap, mp_ptr *bp, mp_size_t n, mp_size_t K) +{ + int i; + unsigned k; + int sqr = (ap == bp); + TMP_DECL; + + TMP_MARK; + + if (n >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) + { + mp_size_t K2, nprime2, Nprime2, M2, maxLK, l, Mp2; + int k; + int **fft_l, *tmp; + mp_ptr *Ap, *Bp, A, B, T; + + k = mpn_fft_best_k (n, sqr); + K2 = (mp_size_t) 1 << k; + ASSERT_ALWAYS((n & (K2 - 1)) == 0); + maxLK = (K2 > GMP_NUMB_BITS) ? K2 : GMP_NUMB_BITS; + M2 = n * GMP_NUMB_BITS >> k; + l = n >> k; + Nprime2 = ((2 * M2 + k + 2 + maxLK) / maxLK) * maxLK; + /* Nprime2 = ceil((2*M2+k+3)/maxLK)*maxLK*/ + nprime2 = Nprime2 / GMP_NUMB_BITS; + + /* we should ensure that nprime2 is a multiple of the next K */ + if (nprime2 >= (sqr ? 
SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) + { + mp_size_t K3; + for (;;) + { + K3 = (mp_size_t) 1 << mpn_fft_best_k (nprime2, sqr); + if ((nprime2 & (K3 - 1)) == 0) + break; + nprime2 = (nprime2 + K3 - 1) & -K3; + Nprime2 = nprime2 * GMP_LIMB_BITS; + /* warning: since nprime2 changed, K3 may change too! */ + } + } + ASSERT_ALWAYS(nprime2 < n); /* otherwise we'll loop */ + + Mp2 = Nprime2 >> k; + + Ap = TMP_BALLOC_MP_PTRS (K2); + Bp = TMP_BALLOC_MP_PTRS (K2); + A = TMP_BALLOC_LIMBS (2 * (nprime2 + 1) << k); + T = TMP_BALLOC_LIMBS (2 * (nprime2 + 1)); + B = A + ((nprime2 + 1) << k); + fft_l = TMP_BALLOC_TYPE (k + 1, int *); + tmp = TMP_BALLOC_TYPE ((size_t) 2 << k, int); + for (i = 0; i <= k; i++) + { + fft_l[i] = tmp; + tmp += (mp_size_t) 1 << i; + } + + mpn_fft_initl (fft_l, k); + + TRACE (printf ("recurse: %ldx%ld limbs -> %ld times %ldx%ld (%1.2f)\n", n, + n, K2, nprime2, nprime2, 2.0*(double)n/nprime2/K2)); + for (i = 0; i < K; i++, ap++, bp++) + { + mp_limb_t cy; + mpn_fft_normalize (*ap, n); + if (!sqr) + mpn_fft_normalize (*bp, n); + + mpn_mul_fft_decompose (A, Ap, K2, nprime2, *ap, (l << k) + 1, l, Mp2, T); + if (!sqr) + mpn_mul_fft_decompose (B, Bp, K2, nprime2, *bp, (l << k) + 1, l, Mp2, T); + + cy = mpn_mul_fft_internal (*ap, n, k, Ap, Bp, A, B, nprime2, + l, Mp2, fft_l, T, sqr); + (*ap)[n] = cy; + } + } +#if ! TUNE_PROGRAM_BUILD + else if (MPN_MULMOD_BKNP1_USABLE (n, k, MUL_FFT_MODF_THRESHOLD)) + { + mp_ptr a; + mp_size_t n_k = n / k; + + if (sqr) + { + mp_ptr tp = TMP_SALLOC_LIMBS (mpn_sqrmod_bknp1_itch (n)); + for (i = 0; i < K; i++) + { + a = *ap++; + mpn_sqrmod_bknp1 (a, a, n_k, k, tp); + } + } + else + { + mp_ptr b, tp = TMP_SALLOC_LIMBS (mpn_mulmod_bknp1_itch (n)); + for (i = 0; i < K; i++) + { + a = *ap++; + b = *bp++; + mpn_mulmod_bknp1 (a, a, b, n_k, k, tp); + } + } + } +#endif + else + { + mp_ptr a, b, tp, tpn; + mp_limb_t cc; + mp_size_t n2 = 2 * n; + tp = TMP_BALLOC_LIMBS (n2); + tpn = tp + n; + TRACE (printf (" mpn_mul_n %ld of %ld limbs\n", K, n)); + for (i = 0; i < K; i++) + { + a = *ap++; + b = *bp++; + if (sqr) + mpn_sqr (tp, a, n); + else + mpn_mul_n (tp, b, a, n); + if (a[n] != 0) + cc = mpn_add_n (tpn, tpn, b, n); + else + cc = 0; + if (b[n] != 0) + cc += mpn_add_n (tpn, tpn, a, n) + a[n]; + if (cc != 0) + { + cc = mpn_add_1 (tp, tp, n2, cc); + /* If mpn_add_1 give a carry (cc != 0), + the result (tp) is at most GMP_NUMB_MAX - 1, + so the following addition can't overflow. + */ + tp[0] += cc; + } + cc = mpn_sub_n (a, tp, tpn, n); + a[n] = 0; + MPN_INCR_U (a, n + 1, cc); + } + } + TMP_FREE; +} + + +/* input: A^[l[k][0]] A^[l[k][1]] ... A^[l[k][K-1]] + output: K*A[0] K*A[K-1] ... K*A[1]. + Assumes the Ap[] are pseudo-normalized, i.e. 0 <= Ap[][n] <= 1. + This condition is also fulfilled at exit. 
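+   The leftover factor K = 2^k is removed afterwards by the caller via
+   mpn_fft_div_2exp_modF, i.e. a multiplication by 2^(2*n*GMP_NUMB_BITS - k)
+   modulo 2^(n*GMP_NUMB_BITS)+1.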
+*/ +static void +mpn_fft_fftinv (mp_ptr *Ap, mp_size_t K, mp_size_t omega, mp_size_t n, mp_ptr tp) +{ + if (K == 2) + { + mp_limb_t cy; +#if HAVE_NATIVE_mpn_add_n_sub_n + cy = mpn_add_n_sub_n (Ap[0], Ap[1], Ap[0], Ap[1], n + 1) & 1; +#else + MPN_COPY (tp, Ap[0], n + 1); + mpn_add_n (Ap[0], Ap[0], Ap[1], n + 1); + cy = mpn_sub_n (Ap[1], tp, Ap[1], n + 1); +#endif + if (Ap[0][n] > 1) /* can be 2 or 3 */ + { /* Ap[0][n] = 1 - mpn_sub_1 (Ap[0], Ap[0], n, Ap[0][n] - 1); */ + mp_limb_t cc = Ap[0][n] - 1; + Ap[0][n] = 1; + MPN_DECR_U (Ap[0], n + 1, cc); + } + if (cy) /* Ap[1][n] can be -1 or -2 */ + { /* Ap[1][n] = mpn_add_1 (Ap[1], Ap[1], n, ~Ap[1][n] + 1); */ + mp_limb_t cc = ~Ap[1][n] + 1; + Ap[1][n] = 0; + MPN_INCR_U (Ap[1], n + 1, cc); + } + } + else + { + mp_size_t j, K2 = K >> 1; + + mpn_fft_fftinv (Ap, K2, 2 * omega, n, tp); + mpn_fft_fftinv (Ap + K2, K2, 2 * omega, n, tp); + /* A[j] <- A[j] + omega^j A[j+K/2] + A[j+K/2] <- A[j] + omega^(j+K/2) A[j+K/2] */ + for (j = 0; j < K2; j++, Ap++) + { + /* Ap[K2] <- Ap[0] + Ap[K2] * 2^((j + K2) * omega) + Ap[0] <- Ap[0] + Ap[K2] * 2^(j * omega) */ + mpn_fft_mul_2exp_modF (tp, Ap[K2], j * omega, n); +#if HAVE_NATIVE_mpn_add_n_sub_n + mpn_fft_add_sub_modF (Ap[0], Ap[K2], tp, n); +#else + mpn_fft_sub_modF (Ap[K2], Ap[0], tp, n); + mpn_fft_add_modF (Ap[0], Ap[0], tp, n); +#endif + } + } +} + + +/* R <- A/2^k mod 2^(n*GMP_NUMB_BITS)+1 */ +static void +mpn_fft_div_2exp_modF (mp_ptr r, mp_srcptr a, mp_bitcnt_t k, mp_size_t n) +{ + mp_bitcnt_t i; + + ASSERT (r != a); + i = (mp_bitcnt_t) 2 * n * GMP_NUMB_BITS - k; + mpn_fft_mul_2exp_modF (r, a, i, n); + /* 1/2^k = 2^(2nL-k) mod 2^(n*GMP_NUMB_BITS)+1 */ + /* normalize so that R < 2^(n*GMP_NUMB_BITS)+1 */ + mpn_fft_normalize (r, n); +} + + +/* {rp,n} <- {ap,an} mod 2^(n*GMP_NUMB_BITS)+1, n <= an <= 3*n. + Returns carry out, i.e. 1 iff {ap,an} = -1 mod 2^(n*GMP_NUMB_BITS)+1, + then {rp,n}=0. +*/ +static mp_size_t +mpn_fft_norm_modF (mp_ptr rp, mp_size_t n, mp_ptr ap, mp_size_t an) +{ + mp_size_t l, m, rpn; + mp_limb_t cc; + + ASSERT ((n <= an) && (an <= 3 * n)); + m = an - 2 * n; + if (m > 0) + { + l = n; + /* add {ap, m} and {ap+2n, m} in {rp, m} */ + cc = mpn_add_n (rp, ap, ap + 2 * n, m); + /* copy {ap+m, n-m} to {rp+m, n-m} */ + rpn = mpn_add_1 (rp + m, ap + m, n - m, cc); + } + else + { + l = an - n; /* l <= n */ + MPN_COPY (rp, ap, n); + rpn = 0; + } + + /* remains to subtract {ap+n, l} from {rp, n+1} */ + rpn -= mpn_sub (rp, rp, n, ap + n, l); + if (rpn < 0) /* necessarily rpn = -1 */ + rpn = mpn_add_1 (rp, rp, n, CNST_LIMB(1)); + return rpn; +} + +/* store in A[0..nprime] the first M bits from {n, nl}, + in A[nprime+1..] the following M bits, ... + Assumes M is a multiple of GMP_NUMB_BITS (M = l * GMP_NUMB_BITS). + T must have space for at least (nprime + 1) limbs. + We must have nl <= 2*K*l. +*/ +static void +mpn_mul_fft_decompose (mp_ptr A, mp_ptr *Ap, mp_size_t K, mp_size_t nprime, + mp_srcptr n, mp_size_t nl, mp_size_t l, mp_size_t Mp, + mp_ptr T) +{ + mp_size_t i, j; + mp_ptr tmp; + mp_size_t Kl = K * l; + TMP_DECL; + TMP_MARK; + + if (nl > Kl) /* normalize {n, nl} mod 2^(Kl*GMP_NUMB_BITS)+1 */ + { + mp_size_t dif = nl - Kl; + + tmp = TMP_BALLOC_LIMBS(Kl + 1); + tmp[Kl] = 0; + +#if ! WANT_OLD_FFT_FULL + ASSERT_ALWAYS (dif <= Kl); +#else + /* The comment "We must have nl <= 2*K*l." says that + ((dif = nl - Kl) > Kl) should never happen. 
*/ + if (UNLIKELY (dif > Kl)) + { + mp_limb_signed_t cy; + int subp = 0; + + cy = mpn_sub_n (tmp, n, n + Kl, Kl); + n += 2 * Kl; + dif -= Kl; + + /* now dif > 0 */ + while (dif > Kl) + { + if (subp) + cy += mpn_sub_n (tmp, tmp, n, Kl); + else + cy -= mpn_add_n (tmp, tmp, n, Kl); + subp ^= 1; + n += Kl; + dif -= Kl; + } + /* now dif <= Kl */ + if (subp) + cy += mpn_sub (tmp, tmp, Kl, n, dif); + else + cy -= mpn_add (tmp, tmp, Kl, n, dif); + if (cy >= 0) + MPN_INCR_U (tmp, Kl + 1, cy); + else + { + tmp[Kl] = 1; + MPN_DECR_U (tmp, Kl + 1, -cy - 1); + } + } + else /* dif <= Kl, i.e. nl <= 2 * Kl */ +#endif + { + mp_limb_t cy; + cy = mpn_sub (tmp, n, Kl, n + Kl, dif); + MPN_INCR_U (tmp, Kl + 1, cy); + } + nl = Kl + 1; + n = tmp; + } + for (i = 0; i < K; i++) + { + Ap[i] = A; + /* store the next M bits of n into A[0..nprime] */ + if (nl > 0) /* nl is the number of remaining limbs */ + { + j = (l <= nl && i < K - 1) ? l : nl; /* store j next limbs */ + nl -= j; + MPN_COPY (T, n, j); + MPN_ZERO (T + j, nprime + 1 - j); + n += l; + mpn_fft_mul_2exp_modF (A, T, i * Mp, nprime); + } + else + MPN_ZERO (A, nprime + 1); + A += nprime + 1; + } + ASSERT_ALWAYS (nl == 0); + TMP_FREE; +} + +/* op <- n*m mod 2^N+1 with fft of size 2^k where N=pl*GMP_NUMB_BITS + op is pl limbs, its high bit is returned. + One must have pl = mpn_fft_next_size (pl, k). + T must have space for 2 * (nprime + 1) limbs. +*/ + +static mp_limb_t +mpn_mul_fft_internal (mp_ptr op, mp_size_t pl, int k, + mp_ptr *Ap, mp_ptr *Bp, mp_ptr unusedA, mp_ptr B, + mp_size_t nprime, mp_size_t l, mp_size_t Mp, + int **fft_l, mp_ptr T, int sqr) +{ + mp_size_t K, i, pla, lo, sh, j; + mp_ptr p; + mp_limb_t cc; + + K = (mp_size_t) 1 << k; + + /* direct fft's */ + mpn_fft_fft (Ap, K, fft_l + k, 2 * Mp, nprime, 1, T); + if (!sqr) + mpn_fft_fft (Bp, K, fft_l + k, 2 * Mp, nprime, 1, T); + + /* term to term multiplications */ + mpn_fft_mul_modF_K (Ap, sqr ? Ap : Bp, nprime, K); + + /* inverse fft's */ + mpn_fft_fftinv (Ap, K, 2 * Mp, nprime, T); + + /* division of terms after inverse fft */ + Bp[0] = T + nprime + 1; + mpn_fft_div_2exp_modF (Bp[0], Ap[0], k, nprime); + for (i = 1; i < K; i++) + { + Bp[i] = Ap[i - 1]; + mpn_fft_div_2exp_modF (Bp[i], Ap[i], k + (K - i) * Mp, nprime); + } + + /* addition of terms in result p */ + MPN_ZERO (T, nprime + 1); + pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */ + p = B; /* B has K*(n' + 1) limbs, which is >= pla, i.e. enough */ + MPN_ZERO (p, pla); + cc = 0; /* will accumulate the (signed) carry at p[pla] */ + for (i = K - 1, lo = l * i + nprime,sh = l * i; i >= 0; i--,lo -= l,sh -= l) + { + mp_ptr n = p + sh; + + j = (K - i) & (K - 1); + + cc += mpn_add (n, n, pla - sh, Bp[j], nprime + 1); + T[2 * l] = i + 1; /* T = (i + 1)*2^(2*M) */ + if (mpn_cmp (Bp[j], T, nprime + 1) > 0) + { /* subtract 2^N'+1 */ + cc -= mpn_sub_1 (n, n, pla - sh, CNST_LIMB(1)); + cc -= mpn_sub_1 (p + lo, p + lo, pla - lo, CNST_LIMB(1)); + } + } + if (cc == -CNST_LIMB(1)) + { + if ((cc = mpn_add_1 (p + pla - pl, p + pla - pl, pl, CNST_LIMB(1)))) + { + /* p[pla-pl]...p[pla-1] are all zero */ + mpn_sub_1 (p + pla - pl - 1, p + pla - pl - 1, pl + 1, CNST_LIMB(1)); + mpn_sub_1 (p + pla - 1, p + pla - 1, 1, CNST_LIMB(1)); + } + } + else if (cc == 1) + { + if (pla >= 2 * pl) + { + while ((cc = mpn_add_1 (p + pla - 2 * pl, p + pla - 2 * pl, 2 * pl, cc))) + ; + } + else + { + MPN_DECR_U (p + pla - pl, pl, cc); + } + } + else + ASSERT (cc == 0); + + /* here p < 2^(2M) [K 2^(M(K-1)) + (K-1) 2^(M(K-2)) + ... 
] + < K 2^(2M) [2^(M(K-1)) + 2^(M(K-2)) + ... ] + < K 2^(2M) 2^(M(K-1))*2 = 2^(M*K+M+k+1) */ + return mpn_fft_norm_modF (op, pl, p, pla); +} + +/* return the lcm of a and 2^k */ +static mp_bitcnt_t +mpn_mul_fft_lcm (mp_bitcnt_t a, int k) +{ + mp_bitcnt_t l = k; + + while (a % 2 == 0 && k > 0) + { + a >>= 1; + k --; + } + return a << l; +} + + +mp_limb_t +mpn_mul_fft (mp_ptr op, mp_size_t pl, + mp_srcptr n, mp_size_t nl, + mp_srcptr m, mp_size_t ml, + int k) +{ + int i; + mp_size_t K, maxLK; + mp_size_t N, Nprime, nprime, M, Mp, l; + mp_ptr *Ap, *Bp, A, T, B; + int **fft_l, *tmp; + int sqr = (n == m && nl == ml); + mp_limb_t h; + TMP_DECL; + + TRACE (printf ("\nmpn_mul_fft pl=%ld nl=%ld ml=%ld k=%d\n", pl, nl, ml, k)); + ASSERT_ALWAYS (mpn_fft_next_size (pl, k) == pl); + + TMP_MARK; + N = pl * GMP_NUMB_BITS; + fft_l = TMP_BALLOC_TYPE (k + 1, int *); + tmp = TMP_BALLOC_TYPE ((size_t) 2 << k, int); + for (i = 0; i <= k; i++) + { + fft_l[i] = tmp; + tmp += (mp_size_t) 1 << i; + } + + mpn_fft_initl (fft_l, k); + K = (mp_size_t) 1 << k; + M = N >> k; /* N = 2^k M */ + l = 1 + (M - 1) / GMP_NUMB_BITS; + maxLK = mpn_mul_fft_lcm (GMP_NUMB_BITS, k); /* lcm (GMP_NUMB_BITS, 2^k) */ + + Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK; + /* Nprime = ceil((2*M+k+3)/maxLK)*maxLK; */ + nprime = Nprime / GMP_NUMB_BITS; + TRACE (printf ("N=%ld K=%ld, M=%ld, l=%ld, maxLK=%ld, Np=%ld, np=%ld\n", + N, K, M, l, maxLK, Nprime, nprime)); + /* we should ensure that recursively, nprime is a multiple of the next K */ + if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD)) + { + mp_size_t K2; + for (;;) + { + K2 = (mp_size_t) 1 << mpn_fft_best_k (nprime, sqr); + if ((nprime & (K2 - 1)) == 0) + break; + nprime = (nprime + K2 - 1) & -K2; + Nprime = nprime * GMP_LIMB_BITS; + /* warning: since nprime changed, K2 may change too! */ + } + TRACE (printf ("new maxLK=%ld, Np=%ld, np=%ld\n", maxLK, Nprime, nprime)); + } + ASSERT_ALWAYS (nprime < pl); /* otherwise we'll loop */ + + T = TMP_BALLOC_LIMBS (2 * (nprime + 1)); + Mp = Nprime >> k; + + TRACE (printf ("%ldx%ld limbs -> %ld times %ldx%ld limbs (%1.2f)\n", + pl, pl, K, nprime, nprime, 2.0 * (double) N / Nprime / K); + printf (" temp space %ld\n", 2 * K * (nprime + 1))); + + A = TMP_BALLOC_LIMBS (K * (nprime + 1)); + Ap = TMP_BALLOC_MP_PTRS (K); + Bp = TMP_BALLOC_MP_PTRS (K); + mpn_mul_fft_decompose (A, Ap, K, nprime, n, nl, l, Mp, T); + if (sqr) + { + mp_size_t pla; + pla = l * (K - 1) + nprime + 1; /* number of required limbs for p */ + B = TMP_BALLOC_LIMBS (pla); + } + else + { + B = TMP_BALLOC_LIMBS (K * (nprime + 1)); + mpn_mul_fft_decompose (B, Bp, K, nprime, m, ml, l, Mp, T); + } + h = mpn_mul_fft_internal (op, pl, k, Ap, Bp, A, B, nprime, l, Mp, fft_l, T, sqr); + + TMP_FREE; + return h; +} + +#if WANT_OLD_FFT_FULL +/* multiply {n, nl} by {m, ml}, and put the result in {op, nl+ml} */ +void +mpn_mul_fft_full (mp_ptr op, + mp_srcptr n, mp_size_t nl, + mp_srcptr m, mp_size_t ml) +{ + mp_ptr pad_op; + mp_size_t pl, pl2, pl3, l; + mp_size_t cc, c2, oldcc; + int k2, k3; + int sqr = (n == m && nl == ml); + + pl = nl + ml; /* total number of limbs of the result */ + + /* perform a fft mod 2^(2N)+1 and one mod 2^(3N)+1. + We must have pl3 = 3/2 * pl2, with pl2 a multiple of 2^k2, and + pl3 a multiple of 2^k3. Since k3 >= k2, both are multiples of 2^k2, + and pl2 must be an even multiple of 2^k2. Thus (pl2,pl3) = + (2*j*2^k2,3*j*2^k2), which works for 3*j <= pl/2^k2 <= 5*j. + We need that consecutive intervals overlap, i.e. 
5*j >= 3*(j+1), + which requires j>=2. Thus this scheme requires pl >= 6 * 2^FFT_FIRST_K. */ + + /* ASSERT_ALWAYS(pl >= 6 * (1 << FFT_FIRST_K)); */ + + pl2 = (2 * pl - 1) / 5; /* ceil (2pl/5) - 1 */ + do + { + pl2++; + k2 = mpn_fft_best_k (pl2, sqr); /* best fft size for pl2 limbs */ + pl2 = mpn_fft_next_size (pl2, k2); + pl3 = 3 * pl2 / 2; /* since k>=FFT_FIRST_K=4, pl2 is a multiple of 2^4, + thus pl2 / 2 is exact */ + k3 = mpn_fft_best_k (pl3, sqr); + } + while (mpn_fft_next_size (pl3, k3) != pl3); + + TRACE (printf ("mpn_mul_fft_full nl=%ld ml=%ld -> pl2=%ld pl3=%ld k=%d\n", + nl, ml, pl2, pl3, k2)); + + ASSERT_ALWAYS(pl3 <= pl); + cc = mpn_mul_fft (op, pl3, n, nl, m, ml, k3); /* mu */ + ASSERT(cc == 0); + pad_op = __GMP_ALLOCATE_FUNC_LIMBS (pl2); + cc = mpn_mul_fft (pad_op, pl2, n, nl, m, ml, k2); /* lambda */ + cc = -cc + mpn_sub_n (pad_op, pad_op, op, pl2); /* lambda - low(mu) */ + /* 0 <= cc <= 1 */ + ASSERT(0 <= cc && cc <= 1); + l = pl3 - pl2; /* l = pl2 / 2 since pl3 = 3/2 * pl2 */ + c2 = mpn_add_n (pad_op, pad_op, op + pl2, l); + cc = mpn_add_1 (pad_op + l, pad_op + l, l, (mp_limb_t) c2) - cc; + ASSERT(-1 <= cc && cc <= 1); + if (cc < 0) + cc = mpn_add_1 (pad_op, pad_op, pl2, (mp_limb_t) -cc); + ASSERT(0 <= cc && cc <= 1); + /* now lambda-mu = {pad_op, pl2} - cc mod 2^(pl2*GMP_NUMB_BITS)+1 */ + oldcc = cc; +#if HAVE_NATIVE_mpn_add_n_sub_n + c2 = mpn_add_n_sub_n (pad_op + l, pad_op, pad_op, pad_op + l, l); + cc += c2 >> 1; /* carry out from high <- low + high */ + c2 = c2 & 1; /* borrow out from low <- low - high */ +#else + { + mp_ptr tmp; + TMP_DECL; + + TMP_MARK; + tmp = TMP_BALLOC_LIMBS (l); + MPN_COPY (tmp, pad_op, l); + c2 = mpn_sub_n (pad_op, pad_op, pad_op + l, l); + cc += mpn_add_n (pad_op + l, tmp, pad_op + l, l); + TMP_FREE; + } +#endif + c2 += oldcc; + /* first normalize {pad_op, pl2} before dividing by 2: c2 is the borrow + at pad_op + l, cc is the carry at pad_op + pl2 */ + /* 0 <= cc <= 2 */ + cc -= mpn_sub_1 (pad_op + l, pad_op + l, l, (mp_limb_t) c2); + /* -1 <= cc <= 2 */ + if (cc > 0) + cc = -mpn_sub_1 (pad_op, pad_op, pl2, (mp_limb_t) cc); + /* now -1 <= cc <= 0 */ + if (cc < 0) + cc = mpn_add_1 (pad_op, pad_op, pl2, (mp_limb_t) -cc); + /* now {pad_op, pl2} is normalized, with 0 <= cc <= 1 */ + if (pad_op[0] & 1) /* if odd, add 2^(pl2*GMP_NUMB_BITS)+1 */ + cc += 1 + mpn_add_1 (pad_op, pad_op, pl2, CNST_LIMB(1)); + /* now 0 <= cc <= 2, but cc=2 cannot occur since it would give a carry + out below */ + mpn_rshift (pad_op, pad_op, pl2, 1); /* divide by two */ + if (cc) /* then cc=1 */ + pad_op [pl2 - 1] |= (mp_limb_t) 1 << (GMP_NUMB_BITS - 1); + /* now {pad_op,pl2}-cc = (lambda-mu)/(1-2^(l*GMP_NUMB_BITS)) + mod 2^(pl2*GMP_NUMB_BITS) + 1 */ + c2 = mpn_add_n (op, op, pad_op, pl2); /* no need to add cc (is 0) */ + /* since pl2+pl3 >= pl, necessary the extra limbs (including cc) are zero */ + MPN_COPY (op + pl3, pad_op, pl - pl3); + ASSERT_MPN_ZERO_P (pad_op + pl - pl3, pl2 + pl3 - pl); + __GMP_FREE_FUNC_LIMBS (pad_op, pl2); + /* since the final result has at most pl limbs, no carry out below */ + MPN_INCR_U (op + pl2, pl - pl2, (mp_limb_t) c2); +} +#endif diff --git a/gmp-6.3.0/mpn/generic/mul_n.c b/gmp-6.3.0/mpn/generic/mul_n.c new file mode 100644 index 0000000..36bd923 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mul_n.c @@ -0,0 +1,96 @@ +/* mpn_mul_n -- multiply natural numbers. + +Copyright 1991, 1993, 1994, 1996-2003, 2005, 2008, 2009 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +void +mpn_mul_n (mp_ptr p, mp_srcptr a, mp_srcptr b, mp_size_t n) +{ + ASSERT (n >= 1); + ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n)); + ASSERT (! MPN_OVERLAP_P (p, 2 * n, b, n)); + + if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + mpn_mul_basecase (p, a, n, b, n); + } + else if (BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) + { + /* Allocate workspace of fixed size on stack: fast! */ + mp_limb_t ws[mpn_toom22_mul_itch (MUL_TOOM33_THRESHOLD_LIMIT-1, + MUL_TOOM33_THRESHOLD_LIMIT-1)]; + ASSERT (MUL_TOOM33_THRESHOLD <= MUL_TOOM33_THRESHOLD_LIMIT); + mpn_toom22_mul (p, a, n, b, n, ws); + } + else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) + { + mp_ptr ws; + TMP_SDECL; + TMP_SMARK; + ws = TMP_SALLOC_LIMBS (mpn_toom33_mul_itch (n, n)); + mpn_toom33_mul (p, a, n, b, n, ws); + TMP_SFREE; + } + else if (BELOW_THRESHOLD (n, MUL_TOOM6H_THRESHOLD)) + { + mp_ptr ws; + TMP_SDECL; + TMP_SMARK; + ws = TMP_SALLOC_LIMBS (mpn_toom44_mul_itch (n, n)); + mpn_toom44_mul (p, a, n, b, n, ws); + TMP_SFREE; + } + else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD)) + { + mp_ptr ws; + TMP_SDECL; + TMP_SMARK; + ws = TMP_SALLOC_LIMBS (mpn_toom6_mul_n_itch (n)); + mpn_toom6h_mul (p, a, n, b, n, ws); + TMP_SFREE; + } + else if (BELOW_THRESHOLD (n, MUL_FFT_THRESHOLD)) + { + mp_ptr ws; + TMP_DECL; + TMP_MARK; + ws = TMP_ALLOC_LIMBS (mpn_toom8_mul_n_itch (n)); + mpn_toom8h_mul (p, a, n, b, n, ws); + TMP_FREE; + } + else + { + /* The current FFT code allocates its own space. That should probably + change. */ + mpn_fft_mul (p, a, n, b, n); + } +} diff --git a/gmp-6.3.0/mpn/generic/mullo_basecase.c b/gmp-6.3.0/mpn/generic/mullo_basecase.c new file mode 100644 index 0000000..9a4cd3d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mullo_basecase.c @@ -0,0 +1,90 @@ +/* mpn_mullo_basecase -- Internal routine to multiply two natural + numbers of length n and return the low part. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright (C) 2000, 2002, 2004, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* FIXME: Should optionally use mpn_mul_2/mpn_addmul_2. */ + +#ifndef MULLO_VARIANT +#define MULLO_VARIANT 2 +#endif + + +#if MULLO_VARIANT == 1 +void +mpn_mullo_basecase (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_size_t i; + + mpn_mul_1 (rp, up, n, vp[0]); + + for (i = n - 1; i > 0; i--) + { + vp++; + rp++; + mpn_addmul_1 (rp, up, i, vp[0]); + } +} +#endif + + +#if MULLO_VARIANT == 2 +void +mpn_mullo_basecase (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t h; + + h = up[0] * vp[n - 1]; + + if (n != 1) + { + mp_size_t i; + mp_limb_t v0; + + v0 = *vp++; + h += up[n - 1] * v0 + mpn_mul_1 (rp, up, n - 1, v0); + rp++; + + for (i = n - 2; i > 0; i--) + { + v0 = *vp++; + h += up[i] * v0 + mpn_addmul_1 (rp, up, i, v0); + rp++; + } + } + + rp[0] = h; +} +#endif diff --git a/gmp-6.3.0/mpn/generic/mullo_n.c b/gmp-6.3.0/mpn/generic/mullo_n.c new file mode 100644 index 0000000..6f4e7ae --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mullo_n.c @@ -0,0 +1,243 @@ +/* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs + of their products. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS + FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED + THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2004, 2005, 2009, 2010, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_range_basecase 1 +#define MAYBE_range_toom22 1 +#else +#define MAYBE_range_basecase \ + ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11)) +#define MAYBE_range_toom22 \ + ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) ) +#endif + +/* THINK: The DC strategy uses different constants in different Toom's + ranges. Something smoother? +*/ + +/* + Compute the least significant half of the product {xy,n}*{yp,n}, or + formally {rp,n} = {xy,n}*{yp,n} Mod (B^n). 
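+
+  At the basecase level this takes only about n*(n+1)/2 limb products:
+  mpn_mullo_basecase above accumulates, for each column i, the products
+  up[j]*vp[i-j] with j <= i, roughly half of a full n x n multiplication.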
+ + Above the given threshold, the Divide and Conquer strategy is used. + The operands are split in two, and a full product plus two mullo + are used to obtain the final result. The more natural strategy is to + split in two halves, but this is far from optimal when a + sub-quadratic multiplication is used. + + Mulders suggests an unbalanced split in favour of the full product, + split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2. + + To compute the value of a, we assume that the cost of mullo for a + given size ML(n) is a fraction of the cost of a full product with + same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2; + then we can write: + + ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e + + Given a value for e, want to minimise the value of k, i.e. the + function k=(1-a)^e/(1-2*a^e). + + With e=2, the exponent for schoolbook multiplication, the minimum is + given by the values a=1-a=1/2. + + With e=log(3)/log(2), the exponent for Karatsuba (aka toom22), + Mulders compute (1-a) = 0.694... and we approximate a with 11/36. + + Other possible approximations follow: + e=log(5)/log(3) [Toom-3] -> a ~= 9/40 + e=log(7)/log(4) [Toom-4] -> a ~= 7/39 + e=log(11)/log(6) [Toom-6] -> a ~= 1/8 + e=log(15)/log(8) [Toom-8] -> a ~= 1/10 + + The values above where obtained with the following trivial commands + in the gp-pari shell: + +fun(e,a)=(1-a)^e/(1-2*a^e) +mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)= 2); + ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp, n)); + ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n)); + + /* Divide-and-conquer */ + + /* We need fractional approximation of the value 0 < a <= 1/2 + giving the minimum in the function k=(1-a)^e/(1-2*a^e). + */ + if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11))) + n1 = n >> 1; + else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11))) + n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */ + else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9))) + n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */ + else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9)) + n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */ + /* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */ + else + n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */ + + n2 = n - n1; + + /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0, + y = y1 2^(n2 GMP_NUMB_BITS) + y0 */ + + /* x0 * y0 */ + mpn_mul_n (tp, xp, yp, n2); + MPN_COPY (rp, tp, n2); + + /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */ + if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD)) + mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1); + else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD)) + mpn_mullo_basecase (tp + n, xp + n2, yp, n1); + else + mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n); + mpn_add_n (rp + n2, tp + n2, tp + n, n1); + + /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */ + if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD)) + mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1); + else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD)) + mpn_mullo_basecase (tp + n, xp, yp + n2, n1); + else + mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n); + mpn_add_n (rp + n2, rp + n2, tp + n, n1); +} + +/* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */ +#define MUL_BASECASE_ALLOC \ + (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 
1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT) + +/* FIXME: This function should accept a temporary area; dc_mullow_n + accepts a pointer tp, and handle the case tp == rp, do the same here. + Maybe recombine the two functions. + THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase + (typically thanks to mpn_addmul_2) should we unconditionally use + mpn_mul_n? +*/ + +void +mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n) +{ + ASSERT (n >= 1); + ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp, n)); + + if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD)) + { + /* Allocate workspace of fixed size on stack: fast! */ + mp_limb_t tp[MUL_BASECASE_ALLOC]; + mpn_mul_basecase (tp, xp, n, yp, n); + MPN_COPY (rp, tp, n); + } + else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD)) + { + mpn_mullo_basecase (rp, xp, yp, n); + } + else + { + mp_ptr tp; + TMP_DECL; + TMP_MARK; + tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n)); + if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD)) + { + mpn_dc_mullo_n (rp, xp, yp, n, tp); + } + else + { + /* For really large operands, use plain mpn_mul_n but throw away upper n + limbs of result. */ +#if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD) + mpn_fft_mul (tp, xp, n, yp, n); +#else + mpn_mul_n (tp, xp, yp, n); +#endif + MPN_COPY (rp, tp, n); + } + TMP_FREE; + } +} diff --git a/gmp-6.3.0/mpn/generic/mulmid.c b/gmp-6.3.0/mpn/generic/mulmid.c new file mode 100644 index 0000000..f35c5fb --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mulmid.c @@ -0,0 +1,255 @@ +/* mpn_mulmid -- middle product + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + +#define CHUNK (200 + MULMID_TOOM42_THRESHOLD) + + +void +mpn_mulmid (mp_ptr rp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn) +{ + mp_size_t rn, k; + mp_ptr scratch, temp; + + ASSERT (an >= bn); + ASSERT (bn >= 1); + ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, ap, an)); + ASSERT (! MPN_OVERLAP_P (rp, an - bn + 3, bp, bn)); + + if (bn < MULMID_TOOM42_THRESHOLD) + { + /* region not tall enough to make toom42 worthwhile for any portion */ + + if (an < CHUNK) + { + /* region not too wide either, just call basecase directly */ + mpn_mulmid_basecase (rp, ap, an, bp, bn); + return; + } + + /* Region quite wide. 
For better locality, use basecase on chunks: + + AAABBBCC.. + .AAABBBCC. + ..AAABBBCC + */ + + k = CHUNK - bn + 1; /* number of diagonals per chunk */ + + /* first chunk (marked A in the above diagram) */ + mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn); + + /* remaining chunks (B, C, etc) */ + an -= k; + + while (an >= CHUNK) + { + mp_limb_t t0, t1, cy; + ap += k, rp += k; + t0 = rp[0], t1 = rp[1]; + mpn_mulmid_basecase (rp, ap, CHUNK, bp, bn); + ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */ + MPN_INCR_U (rp + 1, k + 1, t1 + cy); + an -= k; + } + + if (an >= bn) + { + /* last remaining chunk */ + mp_limb_t t0, t1, cy; + ap += k, rp += k; + t0 = rp[0], t1 = rp[1]; + mpn_mulmid_basecase (rp, ap, an, bp, bn); + ADDC_LIMB (cy, rp[0], rp[0], t0); + MPN_INCR_U (rp + 1, an - bn + 2, t1 + cy); + } + + return; + } + + /* region is tall enough for toom42 */ + + rn = an - bn + 1; + + if (rn < MULMID_TOOM42_THRESHOLD) + { + /* region not wide enough to make toom42 worthwhile for any portion */ + + TMP_DECL; + + if (bn < CHUNK) + { + /* region not too tall either, just call basecase directly */ + mpn_mulmid_basecase (rp, ap, an, bp, bn); + return; + } + + /* Region quite tall. For better locality, use basecase on chunks: + + AAAAA.... + .AAAAA... + ..BBBBB.. + ...BBBBB. + ....CCCCC + */ + + TMP_MARK; + + temp = TMP_ALLOC_LIMBS (rn + 2); + + /* first chunk (marked A in the above diagram) */ + bp += bn - CHUNK, an -= bn - CHUNK; + mpn_mulmid_basecase (rp, ap, an, bp, CHUNK); + + /* remaining chunks (B, C, etc) */ + bn -= CHUNK; + + while (bn >= CHUNK) + { + ap += CHUNK, bp -= CHUNK; + mpn_mulmid_basecase (temp, ap, an, bp, CHUNK); + mpn_add_n (rp, rp, temp, rn + 2); + bn -= CHUNK; + } + + if (bn) + { + /* last remaining chunk */ + ap += CHUNK, bp -= bn; + mpn_mulmid_basecase (temp, ap, rn + bn - 1, bp, bn); + mpn_add_n (rp, rp, temp, rn + 2); + } + + TMP_FREE; + return; + } + + /* we're definitely going to use toom42 somewhere */ + + if (bn > rn) + { + /* slice region into chunks, use toom42 on all chunks except possibly + the last: + + AA.... + .AA... + ..BB.. + ...BB. + ....CC + */ + + TMP_DECL; + TMP_MARK; + + temp = TMP_ALLOC_LIMBS (rn + 2 + mpn_toom42_mulmid_itch (rn)); + scratch = temp + rn + 2; + + /* first chunk (marked A in the above diagram) */ + bp += bn - rn; + mpn_toom42_mulmid (rp, ap, bp, rn, scratch); + + /* remaining chunks (B, C, etc) */ + bn -= rn; + + while (bn >= rn) + { + ap += rn, bp -= rn; + mpn_toom42_mulmid (temp, ap, bp, rn, scratch); + mpn_add_n (rp, rp, temp, rn + 2); + bn -= rn; + } + + if (bn) + { + /* last remaining chunk */ + ap += rn, bp -= bn; + mpn_mulmid (temp, ap, rn + bn - 1, bp, bn); + mpn_add_n (rp, rp, temp, rn + 2); + } + + TMP_FREE; + } + else + { + /* slice region into chunks, use toom42 on all chunks except possibly + the last: + + AAABBBCC.. + .AAABBBCC. 
+ ..AAABBBCC + */ + + TMP_DECL; + TMP_MARK; + + scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (bn)); + + /* first chunk (marked A in the above diagram) */ + mpn_toom42_mulmid (rp, ap, bp, bn, scratch); + + /* remaining chunks (B, C, etc) */ + rn -= bn; + + while (rn >= bn) + { + mp_limb_t t0, t1, cy; + ap += bn, rp += bn; + t0 = rp[0], t1 = rp[1]; + mpn_toom42_mulmid (rp, ap, bp, bn, scratch); + ADDC_LIMB (cy, rp[0], rp[0], t0); /* add back saved limbs */ + MPN_INCR_U (rp + 1, bn + 1, t1 + cy); + rn -= bn; + } + + TMP_FREE; + + if (rn) + { + /* last remaining chunk */ + mp_limb_t t0, t1, cy; + ap += bn, rp += bn; + t0 = rp[0], t1 = rp[1]; + mpn_mulmid (rp, ap, rn + bn - 1, bp, bn); + ADDC_LIMB (cy, rp[0], rp[0], t0); + MPN_INCR_U (rp + 1, rn + 1, t1 + cy); + } + } +} diff --git a/gmp-6.3.0/mpn/generic/mulmid_basecase.c b/gmp-6.3.0/mpn/generic/mulmid_basecase.c new file mode 100644 index 0000000..d5434ea --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mulmid_basecase.c @@ -0,0 +1,82 @@ +/* mpn_mulmid_basecase -- classical middle product algorithm + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" +#include "longlong.h" + +/* Middle product of {up,un} and {vp,vn}, write result to {rp,un-vn+3}. + Must have un >= vn >= 1. + + Neither input buffer may overlap with the output buffer. */ + +void +mpn_mulmid_basecase (mp_ptr rp, + mp_srcptr up, mp_size_t un, + mp_srcptr vp, mp_size_t vn) +{ + mp_limb_t lo, hi; /* last two limbs of output */ + mp_limb_t cy; + + ASSERT (un >= vn); + ASSERT (vn >= 1); + ASSERT (! MPN_OVERLAP_P (rp, un - vn + 3, up, un)); + ASSERT (! 
MPN_OVERLAP_P (rp, un - vn + 3, vp, vn)); + + up += vn - 1; + un -= vn - 1; + + /* multiply by first limb, store result */ + lo = mpn_mul_1 (rp, up, un, vp[0]); + hi = 0; + + /* accumulate remaining rows */ + for (vn--; vn; vn--) + { + up--, vp++; + cy = mpn_addmul_1 (rp, up, un, vp[0]); + add_ssaaaa (hi, lo, hi, lo, CNST_LIMB(0), cy); + } + + /* store final limbs */ +#if GMP_NAIL_BITS != 0 + hi = (hi << GMP_NAIL_BITS) + (lo >> GMP_NUMB_BITS); + lo &= GMP_NUMB_MASK; +#endif + + rp[un] = lo; + rp[un + 1] = hi; +} diff --git a/gmp-6.3.0/mpn/generic/mulmid_n.c b/gmp-6.3.0/mpn/generic/mulmid_n.c new file mode 100644 index 0000000..ac7e8f1 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mulmid_n.c @@ -0,0 +1,61 @@ +/* mpn_mulmid_n -- balanced middle product + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + +void +mpn_mulmid_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n) +{ + ASSERT (n >= 1); + ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1)); + ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n)); + + if (n < MULMID_TOOM42_THRESHOLD) + { + mpn_mulmid_basecase (rp, ap, 2*n - 1, bp, n); + } + else + { + mp_ptr scratch; + TMP_DECL; + TMP_MARK; + scratch = TMP_ALLOC_LIMBS (mpn_toom42_mulmid_itch (n)); + mpn_toom42_mulmid (rp, ap, bp, n, scratch); + TMP_FREE; + } +} diff --git a/gmp-6.3.0/mpn/generic/mulmod_bknp1.c b/gmp-6.3.0/mpn/generic/mulmod_bknp1.c new file mode 100644 index 0000000..feb10eb --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mulmod_bknp1.c @@ -0,0 +1,502 @@ +/* Mulptiplication mod B^n+1, for small operands. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2020-2022 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#ifndef MOD_BKNP1_USE11 +#define MOD_BKNP1_USE11 ((GMP_NUMB_BITS % 8 != 0) && (GMP_NUMB_BITS % 2 == 0)) +#endif +#ifndef MOD_BKNP1_ONLY3 +#define MOD_BKNP1_ONLY3 0 +#endif + +/* {rp, (k - 1) * n} = {op, k * n + 1} % (B^{k*n}+1) / (B^n+1) */ +static void +_mpn_modbknp1dbnp1_n (mp_ptr rp, mp_srcptr op, mp_size_t n, unsigned k) +{ + mp_limb_t hl; + mp_srcptr hp; + unsigned i; + +#if MOD_BKNP1_ONLY3 + ASSERT (k == 3); + k = 3; +#endif + ASSERT (k > 2); + ASSERT (k % 2 == 1); + + --k; + + rp += k * n; + op += k * n; + hp = op; + hl = hp[n]; /* initial op[k*n]. */ + ASSERT (hl < GMP_NUMB_MAX - 1); + +#if MOD_BKNP1_ONLY3 == 0 + /* The first MPN_INCR_U (rp + n, 1, cy); in the loop should be + rp[n] = cy; */ + *rp = 0; +#endif + + i = k >> 1; + do + { + mp_limb_t cy, bw; + rp -= n; + op -= n; + cy = hl + mpn_add_n (rp, op, hp, n); +#if MOD_BKNP1_ONLY3 + rp[n] = cy; +#else + MPN_INCR_U (rp + n, (k - i * 2) * n + 1, cy); +#endif + rp -= n; + op -= n; + bw = hl + mpn_sub_n (rp, op, hp, n); + MPN_DECR_U (rp + n, (k - i * 2 + 1) * n + 1, bw); + } + while (--i != 0); + + for (; (hl = *(rp += k * n)) != 0; ) /* Should run only once... */ + { + *rp = 0; + i = k >> 1; + do + { + rp -= n; + MPN_INCR_U (rp, (k - i * 2 + 1) * n + 1, hl); + rp -= n; + MPN_DECR_U (rp, (k - i * 2 + 2) * n + 1, hl); + } + while (--i != 0); + } +} + +static void +_mpn_modbnp1_pn_ip (mp_ptr r, mp_size_t n, mp_limb_t h) +{ + ASSERT (r[n] == h); + + /* Fully normalise */ + MPN_DECR_U (r, n + 1, h); + h -= r[n]; + r[n] = 0; + MPN_INCR_U (r, n + 1, h); +} + +static void +_mpn_modbnp1_neg_ip (mp_ptr r, mp_size_t n, mp_limb_t h) +{ + r[n] = 0; + MPN_INCR_U (r, n + 1, -h); + if (UNLIKELY (r[n] != 0)) + _mpn_modbnp1_pn_ip (r, n, 1); +} + +static void +_mpn_modbnp1_nc_ip (mp_ptr r, mp_size_t n, mp_limb_t h) +{ + if (h & GMP_NUMB_HIGHBIT) /* This means h < 0 */ + { + _mpn_modbnp1_neg_ip (r, n, h); + } + else + { + r[n] = h; + if (h) + _mpn_modbnp1_pn_ip(r, n, h); + } +} + +/* {rp, rn + 1} = {op, on} mod (B^{rn}+1) */ +/* Used when rn < on < 2*rn. */ +static void +_mpn_modbnp1 (mp_ptr rp, mp_size_t rn, mp_srcptr op, mp_size_t on) +{ + mp_limb_t bw; + +#if 0 + if (UNLIKELY (on <= rn)) + { + MPN_COPY (rp, op, on); + MPN_ZERO (rp + on, rn - on); + return; + } +#endif + + ASSERT (on > rn); + ASSERT (on <= 2 * rn); + + bw = mpn_sub (rp, op, rn, op + rn, on - rn); + rp[rn] = 0; + MPN_INCR_U (rp, rn + 1, bw); +} + +/* {rp, rn + 1} = {op, k * rn + 1} % (B^{rn}+1) */ +/* With odd k >= 3. 
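+   Since B^rn = -1 (mod B^rn+1), the k low blocks of {op, k*rn} enter
+   with alternating signs, op0 - op1 + op2 - ..., and the top limb
+   op[k*rn] together with the accumulated carry is folded in as a
+   (signed) limb of weight B^rn = -1.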
*/ +static void +_mpn_modbnp1_kn (mp_ptr rp, mp_srcptr op, mp_size_t rn, unsigned k) +{ + mp_limb_t cy; + +#if MOD_BKNP1_ONLY3 + ASSERT (k == 3); + k = 3; +#endif + ASSERT (k & 1); + k >>= 1; + ASSERT (0 < k && k < GMP_NUMB_HIGHBIT - 3); + ASSERT (op[(1 + 2 * k) * rn] < GMP_NUMB_HIGHBIT - 2 - k); + + cy = - mpn_sub_n (rp, op, op + rn, rn); + for (;;) { + op += 2 * rn; + cy += mpn_add_n (rp, rp, op, rn); + if (--k == 0) + break; + cy -= mpn_sub_n (rp, rp, op + rn, rn); + }; + + cy += op[rn]; + _mpn_modbnp1_nc_ip (rp, rn, cy); +} + +/* For the various mpn_divexact_byN here, fall back to using either + mpn_pi1_bdiv_q_1 or mpn_divexact_1. The former has less overhead and is + faster if it is native. For now, since mpn_divexact_1 is native on + platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use + mpn_pi1_bdiv_q_1 unconditionally. FIXME. */ + +#ifndef mpn_divexact_by5 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define BINVERT_5 \ + ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 5 * 3 << 3) + 5) & GMP_NUMB_MAX) +#define mpn_divexact_by5(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,5,BINVERT_5,0) +#else +#define mpn_divexact_by5(dst,src,size) mpn_divexact_1(dst,src,size,5) +#endif +#endif + +#ifndef mpn_divexact_by7 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define BINVERT_7 \ + ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 3)) / 7 * 3 << 4) + 7) & GMP_NUMB_MAX) +#define mpn_divexact_by7(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,7,BINVERT_7,0) +#else +#define mpn_divexact_by7(dst,src,size) mpn_divexact_1(dst,src,size,7) +#endif +#endif + +#ifndef mpn_divexact_by11 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define BINVERT_11 \ + ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 10)) / 11 << 5) + 3) & GMP_NUMB_MAX) +#define mpn_divexact_by11(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,11,BINVERT_11,0) +#else +#define mpn_divexact_by11(dst,src,size) mpn_divexact_1(dst,src,size,11) +#endif +#endif + +#ifndef mpn_divexact_by13 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define BINVERT_13 \ + ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 12)) / 13 * 3 << 14) + 3781) & GMP_NUMB_MAX) +#define mpn_divexact_by13(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,13,BINVERT_13,0) +#else +#define mpn_divexact_by13(dst,src,size) mpn_divexact_1(dst,src,size,13) +#endif +#endif + +#ifndef mpn_divexact_by17 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define BINVERT_17 \ + ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 8)) / 17 * 15 << 7) + 113) & GMP_NUMB_MAX) +#define mpn_divexact_by17(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,17,BINVERT_17,0) +#else +#define mpn_divexact_by17(dst,src,size) mpn_divexact_1(dst,src,size,17) +#endif +#endif + +/* Thanks to Chinese remainder theorem, store + in {rp, k*n+1} the value mod (B^(k*n)+1), given + {ap, k*n+1} mod ((B^(k*n)+1)/(B^n+1)) and + {bp, n+1} mod (B^n+1) . + {tp, n+1} is a scratch area. + tp == rp or rp == ap are possible. 
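+
+   Sketch of the recombination: writing M1 = (B^(k*n)+1)/(B^n+1) and
+   M2 = B^n+1, the result is  r = a + M1 * ((b - a) / M1 mod M2).
+   Since B^n = -1 (mod M2) and k is odd, M1 = B^((k-1)n) - B^((k-2)n)
+   + ... + 1 = k (mod M2), so the division by M1 modulo M2 reduces to
+   the exact division by the small odd k (3, 5, 7, ..., 17) done below.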
+*/ +static void +_mpn_crt (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, + mp_size_t n, unsigned k, mp_ptr tp) +{ + mp_limb_t mod; + unsigned i; + +#if MOD_BKNP1_ONLY3 + ASSERT (k == 3); + k = 3; +#endif + _mpn_modbnp1_kn (tp, ap, n, k); + if (mpn_sub_n (tp, bp, tp, n + 1)) + _mpn_modbnp1_neg_ip (tp, n, tp[n]); + +#if MOD_BKNP1_USE11 + if (UNLIKELY (k == 11)) + { + ASSERT (GMP_NUMB_BITS % 2 == 0); + /* mod <- -Mod(B^n+1,11)^-1 */ + mod = n * (GMP_NUMB_BITS % 5) % 5; + if ((mod > 2) || UNLIKELY (mod == 0)) + mod += 5; + + mod *= mpn_mod_1 (tp, n + 1, 11); + } + else +#endif + { +#if GMP_NUMB_BITS % 8 == 0 + /* (2^6 - 1) | (2^{GMP_NUMB_BITS*3/4} - 1) */ + /* (2^6 - 1) = 3^2 * 7 */ + mod = mpn_mod_34lsub1 (tp, n + 1); + ASSERT ((GMP_NUMB_MAX >> (GMP_NUMB_BITS >> 2)) % k == 0); + /* (2^12 - 1) = 3^2 * 5 * 7 * 13 */ + /* (2^24 - 1) = 3^2 * 5 * 7 * 13 * 17 * 241 */ + ASSERT (k == 3 || k == 5 || k == 7 || k == 13 || k == 17); + +#if GMP_NUMB_BITS % 3 != 0 + if (UNLIKELY (k != 3)) + { + ASSERT ((GMP_NUMB_MAX % k == 0) || (n % 3 != 0)); + if ((GMP_NUMB_BITS % 16 == 0) && LIKELY (k == 5)) + mod <<= 1; /* k >> 1 = 1 << 1 */ + else if ((GMP_NUMB_BITS % 16 != 0) || LIKELY (k == 7)) + mod <<= (n << (GMP_NUMB_BITS % 3 >> 1)) % 3; + else if ((GMP_NUMB_BITS % 32 != 0) || LIKELY (k == 13)) + mod *= ((n << (GMP_NUMB_BITS % 3 >> 1)) % 3 == 1) ? 3 : 9; + else /* k == 17 */ + mod <<= 3; /* k >> 1 = 1 << 3 */ +#if 0 + if ((GMP_NUMB_BITS == 8) /* && (k == 7) */ || + (GMP_NUMB_BITS == 16) && (k == 13)) + mod = ((mod & (GMP_NUMB_MAX >> (GMP_NUMB_BITS >> 2))) + + (mod >> (3 * GMP_NUMB_BITS >> 2))); +#endif + } +#else + ASSERT (GMP_NUMB_MAX % k == 0); + /* 2^{GMP_NUMB_BITS} - 1 = 0 (mod k) */ + /* 2^{GMP_NUMB_BITS} = 1 (mod k) */ + /* 2^{n*GMP_NUMB_BITS} + 1 = 2 (mod k) */ + /* -2^{-1} = k >> 1 (mod k) */ + mod *= k >> 1; +#endif +#else + ASSERT_ALWAYS (k == 0); /* Not implemented, should not be used. */ +#endif + } + + MPN_INCR_U (tp, n + 1, mod); + tp[n] += mod; + + if (LIKELY (k == 3)) + ASSERT_NOCARRY (mpn_divexact_by3 (tp, tp, n + 1)); + else if ((GMP_NUMB_BITS % 16 == 0) && LIKELY (k == 5)) + mpn_divexact_by5 (tp, tp, n + 1); + else if (((! 
MOD_BKNP1_USE11) && (GMP_NUMB_BITS % 16 != 0)) + || LIKELY (k == 7)) + mpn_divexact_by7 (tp, tp, n + 1); +#if MOD_BKNP1_USE11 + else if (k == 11) + mpn_divexact_by11 (tp, tp, n + 1); +#endif + else if ((GMP_NUMB_BITS % 32 != 0) || LIKELY (k == 13)) + mpn_divexact_by13 (tp, tp, n + 1); + else /* (k == 17) */ + mpn_divexact_by17 (tp, tp, n + 1); + + rp += k * n; + ap += k * n; /* tp - 1 */ + + rp -= n; + ap -= n; + ASSERT_NOCARRY (mpn_add_n (rp, ap, tp, n + 1)); + + i = k >> 1; + do + { + mp_limb_t cy, bw; + rp -= n; + ap -= n; + bw = mpn_sub_n (rp, ap, tp, n) + tp[n]; + MPN_DECR_U (rp + n, (k - i * 2) * n + 1, bw); + rp -= n; + ap -= n; + cy = mpn_add_n (rp, ap, tp, n) + tp[n]; + MPN_INCR_U (rp + n, (k - i * 2 + 1) * n + 1, cy); + } + while (--i != 0); + + /* if (LIKELY (rp[k * n])) */ + _mpn_modbnp1_pn_ip (rp, k * n, rp[k * n]); +} + + +static void +_mpn_mulmod_bnp1_tp (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn, + mp_ptr tp) +{ + mp_limb_t cy; + unsigned k; + + ASSERT (0 < rn); + ASSERT ((ap[rn] | bp[rn]) <= 1); + + if (UNLIKELY (ap[rn] | bp[rn])) + { + if (ap[rn]) + cy = bp[rn] + mpn_neg (rp, bp, rn); + else /* ap[rn] == 0 */ + cy = mpn_neg (rp, ap, rn); + } + else if (MPN_MULMOD_BKNP1_USABLE(rn, k, MUL_FFT_MODF_THRESHOLD / 3)) + { + rn /= k; + mpn_mulmod_bknp1 (rp, ap, bp, rn, k, tp); + return; + } + else + { + mpn_mul_n (tp, ap, bp, rn); + cy = mpn_sub_n (rp, tp, tp + rn, rn); + } + rp[rn] = 0; + MPN_INCR_U (rp, rn + 1, cy); +} + +/* {rp, kn + 1} = {ap, kn + 1} * {bp, kn + 1} % (B^kn + 1) */ +/* tp must point to at least 4*(k-1)*n+1 limbs*/ +void +mpn_mulmod_bknp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, + mp_size_t n, unsigned k, mp_ptr tp) +{ + mp_ptr hp; + +#if MOD_BKNP1_ONLY3 + ASSERT (k == 3); + k = 3; +#endif + ASSERT (k > 2); + ASSERT (k % 2 == 1); + + /* a % (B^{nn}+1)/(B^{nn/k}+1) */ + _mpn_modbknp1dbnp1_n (tp + (k - 1) * n * 2, ap, n, k); + /* b % (B^{nn}+1)/(B^{nn/k}+1) */ + _mpn_modbknp1dbnp1_n (tp + (k - 1) * n * 3, bp, n, k); + mpn_mul_n (tp, tp + (k - 1) * n * 2, tp + (k - 1) * n * 3, (k - 1) * n); + _mpn_modbnp1 (tp, k * n, tp, (k - 1) * n * 2); + + hp = tp + k * n + 1; + /* a % (B^{nn/k}+1) */ + ASSERT (ap[k * n] <= 1); + _mpn_modbnp1_kn (hp, ap, n, k); + /* b % (B^{nn/k}+1) */ + ASSERT (bp[k * n] <= 1); + _mpn_modbnp1_kn (hp + n + 1, bp, n, k); + _mpn_mulmod_bnp1_tp (hp + (n + 1) * 2, hp, hp + n + 1, n, hp + (n + 1) * 2); + + _mpn_crt (rp, tp, hp + (n + 1) * 2, n, k, hp); +} + + +static void +_mpn_sqrmod_bnp1_tp (mp_ptr rp, mp_srcptr ap, mp_size_t rn, + mp_ptr tp) +{ + mp_limb_t cy; + unsigned k; + + ASSERT (0 < rn); + + if (UNLIKELY (ap[rn])) + { + ASSERT (ap[rn] == 1); + *rp = 1; + MPN_FILL (rp + 1, rn, 0); + return; + } + else if (MPN_SQRMOD_BKNP1_USABLE(rn, k, MUL_FFT_MODF_THRESHOLD / 3)) + { + rn /= k; + mpn_sqrmod_bknp1 (rp, ap, rn, k, tp); + return; + } + else + { + mpn_sqr (tp, ap, rn); + cy = mpn_sub_n (rp, tp, tp + rn, rn); + } + rp[rn] = 0; + MPN_INCR_U (rp, rn + 1, cy); +} + +/* {rp, kn + 1} = {ap, kn + 1}^2 % (B^kn + 1) */ +/* tp must point to at least 3*(k-1)*n+1 limbs*/ +void +mpn_sqrmod_bknp1 (mp_ptr rp, mp_srcptr ap, + mp_size_t n, unsigned k, mp_ptr tp) +{ + mp_ptr hp; + +#if MOD_BKNP1_ONLY3 + ASSERT (k == 3); + k = 3; +#endif + ASSERT (k > 2); + ASSERT (k % 2 == 1); + + /* a % (B^{nn}+1)/(B^{nn/k}+1) */ + _mpn_modbknp1dbnp1_n (tp + (k - 1) * n * 2, ap, n, k); + mpn_sqr (tp, tp + (k - 1) * n * 2, (k - 1) * n); + _mpn_modbnp1 (tp, k * n, tp, (k - 1) * n * 2); + + hp = tp + k * n + 1; + /* a % (B^{nn/k}+1) */ + ASSERT (ap[k * n] <= 1); + 
_mpn_modbnp1_kn (hp, ap, n, k); + _mpn_sqrmod_bnp1_tp (hp + (n + 1), hp, n, hp + (n + 1)); + + _mpn_crt (rp, tp, hp + (n + 1), n, k, hp); +} diff --git a/gmp-6.3.0/mpn/generic/mulmod_bnm1.c b/gmp-6.3.0/mpn/generic/mulmod_bnm1.c new file mode 100644 index 0000000..8229ede --- /dev/null +++ b/gmp-6.3.0/mpn/generic/mulmod_bnm1.c @@ -0,0 +1,374 @@ +/* mulmod_bnm1.c -- multiplication mod B^n-1. + + Contributed to the GNU project by Niels Möller, Torbjorn Granlund and + Marco Bodrato. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010, 2012, 2013, 2020, 2022 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" +#include "longlong.h" + +/* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is + mod B^rn - 1, and values are semi-normalised; zero is represented + as either 0 or B^n - 1. Needs a scratch of 2rn limbs at tp. + tp==rp is allowed. */ +void +mpn_bc_mulmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn, + mp_ptr tp) +{ + mp_limb_t cy; + + ASSERT (0 < rn); + + mpn_mul_n (tp, ap, bp, rn); + cy = mpn_add_n (rp, tp, tp + rn, rn); + /* If cy == 1, then the value of rp is at most B^rn - 2, so there can + * be no overflow when adding in the carry. */ + MPN_INCR_U (rp, rn, cy); +} + + +/* Inputs are {ap,rn+1} and {bp,rn+1}; output is {rp,rn+1}, in + normalised representation, computation is mod B^rn + 1. Needs + a scratch area of 2rn limbs at tp; tp == rp is allowed. + Output is normalised. */ +static void +mpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn, + mp_ptr tp) +{ + mp_limb_t cy; + unsigned k; + + ASSERT (0 < rn); + + if (UNLIKELY (ap[rn] | bp [rn])) + { + if (ap[rn]) + cy = bp [rn] + mpn_neg (rp, bp, rn); + else /* ap[rn] == 0 */ + cy = mpn_neg (rp, ap, rn); + } + else if (MPN_MULMOD_BKNP1_USABLE (rn, k, MUL_FFT_MODF_THRESHOLD)) + { + mp_size_t n_k = rn / k; + TMP_DECL; + + TMP_MARK; + mpn_mulmod_bknp1 (rp, ap, bp, n_k, k, + TMP_ALLOC_LIMBS (mpn_mulmod_bknp1_itch (rn))); + TMP_FREE; + return; + } + else + { + mpn_mul_n (tp, ap, bp, rn); + cy = mpn_sub_n (rp, tp, tp + rn, rn); + } + rp[rn] = 0; + MPN_INCR_U (rp, rn + 1, cy); +} + + +/* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1) + * + * The result is expected to be ZERO if and only if one of the operand + * already is. Otherwise the class [0] Mod(B^rn-1) is represented by + * B^rn-1. 
This should not be a problem if mulmod_bnm1 is used to + * combine results and obtain a natural number when one knows in + * advance that the final value is less than (B^rn-1). + * Moreover it should not be a problem if mulmod_bnm1 is used to + * compute the full product with an+bn <= rn, because this condition + * implies (B^an-1)(B^bn-1) < (B^rn-1) . + * + * Requires 0 < bn <= an <= rn and an + bn > rn/2 + * Scratch need: rn + (need for recursive call OR rn + 4). This gives + * + * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4 + */ +void +mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp) +{ + ASSERT (0 < bn); + ASSERT (bn <= an); + ASSERT (an <= rn); + + if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD)) + { + if (UNLIKELY (bn < rn)) + { + if (UNLIKELY (an + bn <= rn)) + { + mpn_mul (rp, ap, an, bp, bn); + } + else + { + mp_limb_t cy; + mpn_mul (tp, ap, an, bp, bn); + cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn); + MPN_INCR_U (rp, rn, cy); + } + } + else + mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp); + } + else + { + mp_size_t n; + mp_limb_t cy; + mp_limb_t hi; + + n = rn >> 1; + + /* We need at least an + bn >= n, to be able to fit one of the + recursive products at rp. Requiring strict inequality makes + the code slightly simpler. If desired, we could avoid this + restriction by initially halving rn as long as rn is even and + an + bn <= rn/2. */ + + ASSERT (an + bn > n); + + /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1) + and crt together as + + x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)] + */ + +#define a0 ap +#define a1 (ap + n) +#define b0 bp +#define b1 (bp + n) + +#define xp tp /* 2n + 2 */ + /* am1 maybe in {xp, n} */ + /* bm1 maybe in {xp + n, n} */ +#define sp1 (tp + 2*n + 2) + /* ap1 maybe in {sp1, n + 1} */ + /* bp1 maybe in {sp1 + n + 1, n + 1} */ + + { + mp_srcptr am1, bm1; + mp_size_t anm, bnm; + mp_ptr so; + + bm1 = b0; + bnm = bn; + if (LIKELY (an > n)) + { + am1 = xp; + cy = mpn_add (xp, a0, n, a1, an - n); + MPN_INCR_U (xp, n, cy); + anm = n; + so = xp + n; + if (LIKELY (bn > n)) + { + bm1 = so; + cy = mpn_add (so, b0, n, b1, bn - n); + MPN_INCR_U (so, n, cy); + bnm = n; + so += n; + } + } + else + { + so = xp; + am1 = a0; + anm = an; + } + + mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so); + } + + { + int k; + mp_srcptr ap1, bp1; + mp_size_t anp, bnp; + + bp1 = b0; + bnp = bn; + if (LIKELY (an > n)) { + ap1 = sp1; + cy = mpn_sub (sp1, a0, n, a1, an - n); + sp1[n] = 0; + MPN_INCR_U (sp1, n + 1, cy); + anp = n + ap1[n]; + if (LIKELY (bn > n)) { + bp1 = sp1 + n + 1; + cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n); + sp1[2*n+1] = 0; + MPN_INCR_U (sp1 + n + 1, n + 1, cy); + bnp = n + bp1[n]; + } + } else { + ap1 = a0; + anp = an; + } + + if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD)) + k=0; + else + { + int mask; + k = mpn_fft_best_k (n, 0); + mask = (1<>=1;}; + } + if (k >= FFT_FIRST_K) + xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k); + else if (UNLIKELY (bp1 == b0)) + { + ASSERT (anp + bnp <= 2*n+1); + ASSERT (anp + bnp > n); + ASSERT (anp >= bnp); + mpn_mul (xp, ap1, anp, bp1, bnp); + anp = anp + bnp - n; + ASSERT (anp <= n || xp[2*n]==0); + anp-= anp > n; + cy = mpn_sub (xp, xp, n, xp + n, anp); + xp[n] = 0; + MPN_INCR_U (xp, n+1, cy); + } + else + mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp); + } + + /* Here the CRT recomposition begins. + + xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1) + Division by 2 is a bitwise rotation. + + Assumes xp normalised mod (B^n+1). 
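+
+       Concretely, with t = n*GMP_NUMB_BITS: an even value is simply
+       shifted right, and an odd value x satisfies
+         x/2 = (x >> 1) + 2^(t-1)   (mod 2^t - 1)
+       because 2 * 2^(t-1) = 2^t = 1 (mod 2^t - 1); this is the one-bit
+       right rotation carried out by the rsh1add / rshift code below.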
+ + The residue class [0] is represented by [B^n-1]; except when + both input are ZERO. + */ + +#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc +#if HAVE_NATIVE_mpn_rsh1add_nc + cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */ + hi = cy << (GMP_NUMB_BITS - 1); + cy = 0; + /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi + overflows, i.e. a further increment will not overflow again. */ +#else /* ! _nc */ + cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */ + hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */ + cy >>= 1; + /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that + the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */ +#endif +#if GMP_NAIL_BITS == 0 + add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi); +#else + cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1); + rp[n-1] ^= hi; +#endif +#else /* ! HAVE_NATIVE_mpn_rsh1add_n */ +#if HAVE_NATIVE_mpn_add_nc + cy = mpn_add_nc(rp, rp, xp, n, xp[n]); +#else /* ! _nc */ + cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */ +#endif + cy += (rp[0]&1); + mpn_rshift(rp, rp, n, 1); + ASSERT (cy <= 2); + hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */ + cy >>= 1; + /* We can have cy != 0 only if hi = 0... */ + ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0); + rp[n-1] |= hi; + /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */ +#endif + ASSERT (cy <= 1); + /* Next increment can not overflow, read the previous comments about cy. */ + ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0)); + MPN_INCR_U(rp, n, cy); + + /* Compute the highest half: + ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n + */ + if (UNLIKELY (an + bn < rn)) + { + /* Note that in this case, the only way the result can equal + zero mod B^{rn} - 1 is if one of the inputs is zero, and + then the output of both the recursive calls and this CRT + reconstruction is zero, not B^{rn} - 1. Which is good, + since the latter representation doesn't fit in the output + area.*/ + cy = mpn_sub_n (rp + n, rp, xp, an + bn - n); + + /* FIXME: This subtraction of the high parts is not really + necessary, we do it to get the carry out, and for sanity + checking. */ + cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n, + xp + an + bn - n, rn - (an + bn), cy); + ASSERT (an + bn == rn - 1 || + mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn))); + cy = mpn_sub_1 (rp, rp, an + bn, cy); + ASSERT (cy == (xp + an + bn - n)[0]); + } + else + { + cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n); + /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO. + DECR will affect _at most_ the lowest n limbs. */ + MPN_DECR_U (rp, 2*n, cy); + } +#undef a0 +#undef a1 +#undef b0 +#undef b1 +#undef xp +#undef sp1 + } +} + +mp_size_t +mpn_mulmod_bnm1_next_size (mp_size_t n) +{ + mp_size_t nh; + + if (BELOW_THRESHOLD (n, MULMOD_BNM1_THRESHOLD)) + return n; + if (BELOW_THRESHOLD (n, 4 * (MULMOD_BNM1_THRESHOLD - 1) + 1)) + return (n + (2-1)) & (-2); + if (BELOW_THRESHOLD (n, 8 * (MULMOD_BNM1_THRESHOLD - 1) + 1)) + return (n + (4-1)) & (-4); + + nh = (n + 1) >> 1; + + if (BELOW_THRESHOLD (nh, MUL_FFT_MODF_THRESHOLD)) + return (n + (8-1)) & (-8); + + return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 0)); +} diff --git a/gmp-6.3.0/mpn/generic/neg.c b/gmp-6.3.0/mpn/generic/neg.c new file mode 100644 index 0000000..bec2a32 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/neg.c @@ -0,0 +1,33 @@ +/* mpn_neg - negate an mpn. + +Copyright 2001, 2009 Free Software Foundation, Inc. 
+ +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define __GMP_FORCE_mpn_neg 1 + +#include "gmp-impl.h" diff --git a/gmp-6.3.0/mpn/generic/nussbaumer_mul.c b/gmp-6.3.0/mpn/generic/nussbaumer_mul.c new file mode 100644 index 0000000..3e0cf27 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/nussbaumer_mul.c @@ -0,0 +1,70 @@ +/* mpn_nussbaumer_mul -- Multiply {ap,an} and {bp,bn} using + Nussbaumer's negacyclic convolution. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Multiply {ap,an} by {bp,bn}, and put the result in {pp, an+bn} */ +void +mpn_nussbaumer_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn) +{ + mp_size_t rn; + mp_ptr tp; + TMP_DECL; + + ASSERT (an >= bn); + ASSERT (bn > 0); + + TMP_MARK; + + if ((ap == bp) && (an == bn)) + { + rn = mpn_sqrmod_bnm1_next_size (2*an); + tp = TMP_ALLOC_LIMBS (mpn_sqrmod_bnm1_itch (rn, an)); + mpn_sqrmod_bnm1 (pp, rn, ap, an, tp); + } + else + { + rn = mpn_mulmod_bnm1_next_size (an + bn); + tp = TMP_ALLOC_LIMBS (mpn_mulmod_bnm1_itch (rn, an, bn)); + mpn_mulmod_bnm1 (pp, rn, ap, an, bp, bn, tp); + } + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/perfpow.c b/gmp-6.3.0/mpn/generic/perfpow.c new file mode 100644 index 0000000..9d46477 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/perfpow.c @@ -0,0 +1,342 @@ +/* mpn_perfect_power_p -- mpn perfect power detection. 
+ + Contributed to the GNU project by Martin Boij. + +Copyright 2009, 2010, 2012, 2014 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#define SMALL 20 +#define MEDIUM 100 + +/* Return non-zero if {np,nn} == {xp,xn} ^ k. + Algorithm: + For s = 1, 2, 4, ..., s_max, compute the s least significant limbs of + {xp,xn}^k. Stop if they don't match the s least significant limbs of + {np,nn}. + + FIXME: Low xn limbs can be expected to always match, if computed as a mod + B^{xn} root. So instead of using mpn_powlo, compute an approximation of the + most significant (normalized) limb of {xp,xn} ^ k (and an error bound), and + compare to {np, nn}. Or use an even cruder approximation based on fix-point + base 2 logarithm. */ +static int +pow_equals (mp_srcptr np, mp_size_t n, + mp_srcptr xp,mp_size_t xn, + mp_limb_t k, mp_bitcnt_t f, + mp_ptr tp) +{ + mp_bitcnt_t y, z; + mp_size_t bn; + mp_limb_t h, l; + + ASSERT (n > 1 || (n == 1 && np[0] > 1)); + ASSERT (np[n - 1] > 0); + ASSERT (xn > 0); + + if (xn == 1 && xp[0] == 1) + return 0; + + z = 1 + (n >> 1); + for (bn = 1; bn < z; bn <<= 1) + { + mpn_powlo (tp, xp, &k, 1, bn, tp + bn); + if (mpn_cmp (tp, np, bn) != 0) + return 0; + } + + /* Final check. Estimate the size of {xp,xn}^k before computing the power + with full precision. Optimization: It might pay off to make a more + accurate estimation of the logarithm of {xp,xn}, rather than using the + index of the MSB. */ + + MPN_SIZEINBASE_2EXP(y, xp, xn, 1); + y -= 1; /* msb_index (xp, xn) */ + + umul_ppmm (h, l, k, y); + h -= l == 0; --l; /* two-limb decrement */ + + z = f - 1; /* msb_index (np, n) */ + if (h == 0 && l <= z) + { + mp_limb_t *tp2; + mp_size_t i; + int ans; + mp_limb_t size; + TMP_DECL; + + size = l + k; + ASSERT_ALWAYS (size >= k); + + TMP_MARK; + y = 2 + size / GMP_LIMB_BITS; + tp2 = TMP_ALLOC_LIMBS (y); + + i = mpn_pow_1 (tp, xp, xn, k, tp2); + if (i == n && mpn_cmp (tp, np, n) == 0) + ans = 1; + else + ans = 0; + TMP_FREE; + return ans; + } + + return 0; +} + + +/* Return non-zero if N = {np,n} is a kth power. + I = {ip,n} = N^(-1) mod B^n. 
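+   A candidate root is first computed 2-adically (modulo 2^b, with b
+   about f/k bits) from I, using mpn_bsqrtinv for k = 2 and
+   mpn_brootinv otherwise, and the guess is then verified against
+   {np,n} by pow_equals.  For k = 2 both roots r and 2^b - r have to
+   be tried, since they have the same square modulo 2^b.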
*/ +static int +is_kth_power (mp_ptr rp, mp_srcptr np, + mp_limb_t k, mp_srcptr ip, + mp_size_t n, mp_bitcnt_t f, + mp_ptr tp) +{ + mp_bitcnt_t b; + mp_size_t rn, xn; + + ASSERT (n > 0); + ASSERT ((k & 1) != 0 || k == 2); + ASSERT ((np[0] & 1) != 0); + + if (k == 2) + { + b = (f + 1) >> 1; + rn = 1 + b / GMP_LIMB_BITS; + if (mpn_bsqrtinv (rp, ip, b, tp) != 0) + { + rp[rn - 1] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1; + xn = rn; + MPN_NORMALIZE (rp, xn); + if (pow_equals (np, n, rp, xn, k, f, tp) != 0) + return 1; + + /* Check if (2^b - r)^2 == n */ + mpn_neg (rp, rp, rn); + rp[rn - 1] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1; + MPN_NORMALIZE (rp, rn); + if (pow_equals (np, n, rp, rn, k, f, tp) != 0) + return 1; + } + } + else + { + b = 1 + (f - 1) / k; + rn = 1 + (b - 1) / GMP_LIMB_BITS; + mpn_brootinv (rp, ip, rn, k, tp); + if ((b % GMP_LIMB_BITS) != 0) + rp[rn - 1] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1; + MPN_NORMALIZE (rp, rn); + if (pow_equals (np, n, rp, rn, k, f, tp) != 0) + return 1; + } + MPN_ZERO (rp, rn); /* Untrash rp */ + return 0; +} + +static int +perfpow (mp_srcptr np, mp_size_t n, + mp_limb_t ub, mp_limb_t g, + mp_bitcnt_t f, int neg) +{ + mp_ptr ip, tp, rp; + mp_limb_t k; + int ans; + mp_bitcnt_t b; + gmp_primesieve_t ps; + TMP_DECL; + + ASSERT (n > 0); + ASSERT ((np[0] & 1) != 0); + ASSERT (ub > 0); + + TMP_MARK; + gmp_init_primesieve (&ps); + b = (f + 3) >> 1; + + TMP_ALLOC_LIMBS_3 (ip, n, rp, n, tp, 5 * n); + + MPN_ZERO (rp, n); + + /* FIXME: It seems the inverse in ninv is needed only to get non-inverted + roots. I.e., is_kth_power computes n^{1/2} as (n^{-1})^{-1/2} and + similarly for nth roots. It should be more efficient to compute n^{1/2} as + n * n^{-1/2}, with a mullo instead of a binvert. And we can do something + similar for kth roots if we switch to an iteration converging to n^{1/k - + 1}, and we can then eliminate this binvert call. */ + mpn_binvert (ip, np, 1 + (b - 1) / GMP_LIMB_BITS, tp); + if (b % GMP_LIMB_BITS) + ip[(b - 1) / GMP_LIMB_BITS] &= (CNST_LIMB(1) << (b % GMP_LIMB_BITS)) - 1; + + if (neg) + gmp_nextprime (&ps); + + ans = 0; + if (g > 0) + { + ub = MIN (ub, g + 1); + while ((k = gmp_nextprime (&ps)) < ub) + { + if ((g % k) == 0) + { + if (is_kth_power (rp, np, k, ip, n, f, tp) != 0) + { + ans = 1; + goto ret; + } + } + } + } + else + { + while ((k = gmp_nextprime (&ps)) < ub) + { + if (is_kth_power (rp, np, k, ip, n, f, tp) != 0) + { + ans = 1; + goto ret; + } + } + } + ret: + TMP_FREE; + return ans; +} + +static const unsigned short nrtrial[] = { 100, 500, 1000 }; + +/* Table of (log_{p_i} 2) values, where p_i is the (nrtrial[i] + 1)'th prime + number. */ +static const double logs[] = + { 0.1099457228193620, 0.0847016403115322, 0.0772048195144415 }; + +int +mpn_perfect_power_p (mp_srcptr np, mp_size_t n) +{ + mp_limb_t *nc, factor, g; + mp_limb_t exp, d; + mp_bitcnt_t twos, count; + int ans, where, neg, trial; + TMP_DECL; + + neg = n < 0; + if (neg) + { + n = -n; + } + + if (n == 0 || (n == 1 && np[0] == 1)) /* Valgrind doesn't like + (n <= (np[0] == 1)) */ + return 1; + + TMP_MARK; + + count = 0; + + twos = mpn_scan1 (np, 0); + if (twos != 0) + { + mp_size_t s; + if (twos == 1) + { + return 0; + } + s = twos / GMP_LIMB_BITS; + if (s + 1 == n && POW2_P (np[s])) + { + return ! 
(neg && POW2_P (twos)); + } + count = twos % GMP_LIMB_BITS; + n -= s; + np += s; + if (count > 0) + { + nc = TMP_ALLOC_LIMBS (n); + mpn_rshift (nc, np, n, count); + n -= (nc[n - 1] == 0); + np = nc; + } + } + g = twos; + + trial = (n > SMALL) + (n > MEDIUM); + + where = 0; + factor = mpn_trialdiv (np, n, nrtrial[trial], &where); + + if (factor != 0) + { + if (count == 0) /* We did not allocate nc yet. */ + { + nc = TMP_ALLOC_LIMBS (n); + } + + /* Remove factors found by trialdiv. Optimization: If remove + define _itch, we can allocate its scratch just once */ + + do + { + binvert_limb (d, factor); + + /* After the first round we always have nc == np */ + exp = mpn_remove (nc, &n, np, n, &d, 1, ~(mp_bitcnt_t)0); + + if (g == 0) + g = exp; + else + g = mpn_gcd_1 (&g, 1, exp); + + if (g == 1) + { + ans = 0; + goto ret; + } + + if ((n == 1) & (nc[0] == 1)) + { + ans = ! (neg && POW2_P (g)); + goto ret; + } + + np = nc; + factor = mpn_trialdiv (np, n, nrtrial[trial], &where); + } + while (factor != 0); + } + + MPN_SIZEINBASE_2EXP(count, np, n, 1); /* log (np) + 1 */ + d = (mp_limb_t) (count * logs[trial] + 1e-9) + 1; + ans = perfpow (np, n, d, g, count, neg); + + ret: + TMP_FREE; + return ans; +} diff --git a/gmp-6.3.0/mpn/generic/perfsqr.c b/gmp-6.3.0/mpn/generic/perfsqr.c new file mode 100644 index 0000000..1ea5c84 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/perfsqr.c @@ -0,0 +1,238 @@ +/* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square, + zero otherwise. + +Copyright 1991, 1993, 1994, 1996, 1997, 2000-2002, 2005, 2012 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include /* for NULL */ +#include "gmp-impl.h" +#include "longlong.h" + +#include "perfsqr.h" + + +/* change this to "#define TRACE(x) x" for diagnostics */ +#define TRACE(x) + + + +/* PERFSQR_MOD_* detects non-squares using residue tests. + + A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h. It takes + {up,usize} modulo a selected modulus to get a remainder r. For 32-bit or + 64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34, + or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP + used. PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this + case too. + + PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or + PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table + data indicating residues and non-residues modulo those divisors. The + table data is in 1 or 2 limbs worth of bits respectively, per the size of + each d. + + A "modexact" style remainder is taken to reduce r modulo d. 
+   PERFSQR_MOD_IDX implements this, producing an index "idx" for use with
+   the table data.  Notice there's just one multiplication by a constant
+   "inv", for each d.
+
+   The modexact doesn't produce a true r%d remainder, instead idx satisfies
+   "-(idx << PERFSQR_MOD_BITS) == r mod d".  The table data generated by
+   gen-psqr.c is arranged to suit this transformed index, so it can be
+   looked up directly.  */
+
+#define MOD34_BITS  (GMP_NUMB_BITS / 4 * 3)
+#define MOD34_MASK  ((CNST_LIMB(1) << MOD34_BITS) - 1)
+
+#define PERFSQR_MOD_34(r, up, usize) \
+  do { \
+    (r) = mpn_mod_34lsub1 (up, usize); \
+    (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS); \
+  } while (0)
+
+/* FIXME: The %= here isn't good, and might destroy any savings from keeping
+   the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm).
+   Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor
+   and a shift count, like mpn_preinv_divrem_1.  But mod_34lsub1 is our
+   normal case, so lets not worry too much about mod_1. */
+#define PERFSQR_MOD_PP(r, up, usize) \
+  do { \
+    if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD)) \
+      { \
+        (r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM, \
+                                PERFSQR_PP_INVERTED); \
+        (r) %= PERFSQR_PP; \
+      } \
+    else \
+      { \
+        (r) = mpn_mod_1 (up, usize, PERFSQR_PP); \
+      } \
+  } while (0)
+
+#define PERFSQR_MOD_IDX(idx, r, d, inv) \
+  do { \
+    mp_limb_t  q; \
+    ASSERT ((r) <= PERFSQR_MOD_MASK); \
+    ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1); \
+    ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK); \
+ \
+    q = ((r) * (inv)) & PERFSQR_MOD_MASK; \
+    ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK)); \
+    (idx) = (q * (d)) >> PERFSQR_MOD_BITS; \
+  } while (0)
+
+#define PERFSQR_MOD_1(r, d, inv, mask) \
+  do { \
+    unsigned   idx; \
+    ASSERT ((d) <= GMP_LIMB_BITS); \
+    PERFSQR_MOD_IDX(idx, r, d, inv); \
+    TRACE (printf ("  PERFSQR_MOD_1 d=%u r=%lu idx=%u\n", \
+                   d, r%d, idx)); \
+    if ((((mask) >> idx) & 1) == 0) \
+      { \
+        TRACE (printf ("  non-square\n")); \
+        return 0; \
+      } \
+  } while (0)
+
+/* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the
+   sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch. */
+#define PERFSQR_MOD_2(r, d, inv, mhi, mlo) \
+  do { \
+    mp_limb_t  m; \
+    unsigned   idx; \
+    ASSERT ((d) <= 2*GMP_LIMB_BITS); \
+ \
+    PERFSQR_MOD_IDX (idx, r, d, inv); \
+    TRACE (printf ("  PERFSQR_MOD_2 d=%u r=%lu idx=%u\n", \
+                   d, r%d, idx)); \
+    m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi)); \
+    idx %= GMP_LIMB_BITS; \
+    if (((m >> idx) & 1) == 0) \
+      { \
+        TRACE (printf ("  non-square\n")); \
+        return 0; \
+      } \
+  } while (0)
+
+
+int
+mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
+{
+  ASSERT (usize >= 1);
+
+  TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));
+
+  /* The first test excludes 212/256 (82.8%) of the perfect square candidates
+     in O(1) time. */
+  {
+    unsigned  idx = up[0] % 0x100;
+    if (((sq_res_0x100[idx / GMP_LIMB_BITS]
+          >> (idx % GMP_LIMB_BITS)) & 1) == 0)
+      return 0;
+  }
+
+#if 0
+  /* Check that we have even multiplicity of 2, and then check that the rest is
+     a possible perfect square.  Leave disabled until we can determine this
+     really is an improvement.  If it is, it could completely replace the
+     simple probe above, since this should throw out more non-squares, but at
+     the expense of somewhat more cycles. */
+  {
+    mp_limb_t lo;
+    int cnt;
+    lo = up[0];
+    while (lo == 0)
+      up++, lo = up[0], usize--;
+    count_trailing_zeros (cnt, lo);
+    if ((cnt & 1) != 0)
+      return 0; /* return of not even multiplicity of 2 */
+    lo >>= cnt; /* shift down to align lowest non-zero bit */
+    if ((lo & 6) != 0)
+      return 0;
+  }
+#endif
+
+
+  /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares
+     according to their residues modulo small primes (or powers of
+     primes).  See perfsqr.h.
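As a word-size illustration of this residue-filter idea, not part of the mpn interface: the quadratic-residues-mod-256 bitmap used by the first test above can be built and consulted as below. The name is_square_u64 is hypothetical, and the sketch assumes unsigned __int128 support and <math.h>'s sqrt.

#include <stdint.h>
#include <math.h>

// Sketch only: perfect-square test for one 64-bit word.
static int is_square_u64 (uint64_t n)
{
  // Bitmap of the 44 quadratic residues mod 256, built on first use.
  static uint64_t qr[4];
  static int ready;
  if (!ready)
    {
      for (unsigned i = 0; i < 256; i++)
        {
          unsigned r = (i * i) & 0xff;
          qr[r / 64] |= (uint64_t) 1 << (r % 64);
        }
      ready = 1;
    }

  unsigned r = n & 0xff;
  if (((qr[r / 64] >> (r % 64)) & 1) == 0)
    return 0;                       // residue mod 256 rules out a square

  // Slow path: integer square root, then confirm exactly.
  uint64_t s = (uint64_t) sqrt ((double) n);
  while ((unsigned __int128) s * s > n)
    s--;
  while ((unsigned __int128) (s + 1) * (s + 1) <= n)
    s++;
  return (unsigned __int128) s * s == n;
}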
*/ + PERFSQR_MOD_TEST (up, usize); + + + /* For the third and last test, we finally compute the square root, + to make sure we've really got a perfect square. */ + { + mp_ptr root_ptr; + int res; + TMP_DECL; + + TMP_MARK; + root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2); + + /* Iff mpn_sqrtrem returns zero, the square is perfect. */ + res = ! mpn_sqrtrem (root_ptr, NULL, up, usize); + TMP_FREE; + + return res; + } +} diff --git a/gmp-6.3.0/mpn/generic/popham.c b/gmp-6.3.0/mpn/generic/popham.c new file mode 100644 index 0000000..87974d7 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/popham.c @@ -0,0 +1,125 @@ +/* mpn_popcount, mpn_hamdist -- mpn bit population count/hamming distance. + +Copyright 1994, 1996, 2000-2002, 2005, 2011, 2012 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if OPERATION_popcount +#define FNAME mpn_popcount +#define POPHAM(u,v) u +#endif + +#if OPERATION_hamdist +#define FNAME mpn_hamdist +#define POPHAM(u,v) u ^ v +#endif + +mp_bitcnt_t +FNAME (mp_srcptr up, +#if OPERATION_hamdist + mp_srcptr vp, +#endif + mp_size_t n) __GMP_NOTHROW +{ + mp_bitcnt_t result = 0; + mp_limb_t p0, p1, p2, p3, x, p01, p23; + mp_size_t i; + + ASSERT (n >= 1); /* Actually, this code handles any n, but some + assembly implementations do not. 
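As an aside, not GMP code: the loop below counts bits four limbs at a time with a masking tree; on a single 64-bit word the same tree is the classic SWAR population count, sketched here for orientation under a hypothetical name.

#include <stdint.h>

// Sketch only: population count of one 64-bit word.
static unsigned popcount_u64 (uint64_t x)
{
  x -= (x >> 1) & 0x5555555555555555ULL;                                 // 2-bit sums, each 0..2
  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);  // 4-bit sums, each 0..4
  x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;                            // 8-bit sums, each 0..8
  return (unsigned) ((x * 0x0101010101010101ULL) >> 56);                 // add the eight bytes
}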
*/ + + for (i = n >> 2; i != 0; i--) + { + p0 = POPHAM (up[0], vp[0]); + p0 -= (p0 >> 1) & MP_LIMB_T_MAX/3; /* 2 0-2 */ + p0 = ((p0 >> 2) & MP_LIMB_T_MAX/5) + (p0 & MP_LIMB_T_MAX/5); /* 4 0-4 */ + + p1 = POPHAM (up[1], vp[1]); + p1 -= (p1 >> 1) & MP_LIMB_T_MAX/3; /* 2 0-2 */ + p1 = ((p1 >> 2) & MP_LIMB_T_MAX/5) + (p1 & MP_LIMB_T_MAX/5); /* 4 0-4 */ + + p01 = p0 + p1; /* 8 0-8 */ + p01 = ((p01 >> 4) & MP_LIMB_T_MAX/17) + (p01 & MP_LIMB_T_MAX/17); /* 8 0-16 */ + + p2 = POPHAM (up[2], vp[2]); + p2 -= (p2 >> 1) & MP_LIMB_T_MAX/3; /* 2 0-2 */ + p2 = ((p2 >> 2) & MP_LIMB_T_MAX/5) + (p2 & MP_LIMB_T_MAX/5); /* 4 0-4 */ + + p3 = POPHAM (up[3], vp[3]); + p3 -= (p3 >> 1) & MP_LIMB_T_MAX/3; /* 2 0-2 */ + p3 = ((p3 >> 2) & MP_LIMB_T_MAX/5) + (p3 & MP_LIMB_T_MAX/5); /* 4 0-4 */ + + p23 = p2 + p3; /* 8 0-8 */ + p23 = ((p23 >> 4) & MP_LIMB_T_MAX/17) + (p23 & MP_LIMB_T_MAX/17); /* 8 0-16 */ + + x = p01 + p23; /* 8 0-32 */ + x = (x >> 8) + x; /* 8 0-64 */ + x = (x >> 16) + x; /* 8 0-128 */ +#if GMP_LIMB_BITS > 32 + x = ((x >> 32) & 0xff) + (x & 0xff); /* 8 0-256 */ + result += x; +#else + result += x & 0xff; +#endif + up += 4; +#if OPERATION_hamdist + vp += 4; +#endif + } + + n &= 3; + if (n != 0) + { + x = 0; + do + { + p0 = POPHAM (up[0], vp[0]); + p0 -= (p0 >> 1) & MP_LIMB_T_MAX/3; /* 2 0-2 */ + p0 = ((p0 >> 2) & MP_LIMB_T_MAX/5) + (p0 & MP_LIMB_T_MAX/5); /* 4 0-4 */ + p0 = ((p0 >> 4) + p0) & MP_LIMB_T_MAX/17; /* 8 0-8 */ + + x += p0; + up += 1; +#if OPERATION_hamdist + vp += 1; +#endif + } + while (--n); + + x = (x >> 8) + x; + x = (x >> 16) + x; +#if GMP_LIMB_BITS > 32 + x = (x >> 32) + x; +#endif + result += x & 0xff; + } + + return result; +} diff --git a/gmp-6.3.0/mpn/generic/pow_1.c b/gmp-6.3.0/mpn/generic/pow_1.c new file mode 100644 index 0000000..de11cd2 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/pow_1.c @@ -0,0 +1,135 @@ +/* mpn_pow_1 -- Compute powers R = U^exp. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2002, 2014 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
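As a sketch, not the GMP code itself: mpn_pow_1 below scans the exponent from its most significant bit, squaring for every bit and multiplying in the base where a bit is 1. On a single word, with wrap-around arithmetic and a hypothetical name, that left-to-right scheme looks like this:

#include <stdint.h>

// Sketch only: b^e for 64-bit words, the product taken mod 2^64 (it wraps).
static uint64_t pow_u64 (uint64_t b, uint64_t e)
{
  if (e == 0)
    return 1;

  int top = 63;
  while (((e >> top) & 1) == 0)
    top--;                        // index of the most significant 1 bit

  uint64_t r = b;                 // the leading bit contributes b^1
  for (int i = top - 1; i >= 0; i--)
    {
      r *= r;                     // square for every remaining bit
      if ((e >> i) & 1)
        r *= b;                   // multiply when the bit is set
    }
  return r;
}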
*/ + + +#include "gmp-impl.h" +#include "longlong.h" + +mp_size_t +mpn_pow_1 (mp_ptr rp, mp_srcptr bp, mp_size_t bn, mp_limb_t exp, mp_ptr tp) +{ + mp_limb_t x; + int cnt, i; + mp_size_t rn; + int par; + + ASSERT (bn >= 1); + /* FIXME: Add operand overlap criteria */ + + if (exp <= 1) + { + if (exp == 0) + { + rp[0] = 1; + return 1; + } + else + { + MPN_COPY (rp, bp, bn); + return bn; + } + } + + /* Count number of bits in exp, and compute where to put initial square in + order to magically get results in the entry rp. Use simple code, + optimized for small exp. For large exp, the bignum operations will take + so much time that the slowness of this code will be negligible. */ + par = 0; + cnt = GMP_LIMB_BITS; + x = exp; + do + { + par ^= x; + cnt--; + x >>= 1; + } while (x != 0); + exp <<= cnt; + + if (bn == 1) + { + mp_limb_t rl, rh, bl = bp[0]; + + if ((cnt & 1) != 0) + MP_PTR_SWAP (rp, tp); + + umul_ppmm (rh, rl, bl, bl << GMP_NAIL_BITS); + rp[0] = rl >> GMP_NAIL_BITS; + rp[1] = rh; + rn = 1 + (rh != 0); + + for (i = GMP_LIMB_BITS - cnt - 1;;) + { + exp <<= 1; + if ((exp & GMP_LIMB_HIGHBIT) != 0) + { + rp[rn] = rh = mpn_mul_1 (rp, rp, rn, bl); + rn += rh != 0; + } + + if (--i == 0) + break; + + mpn_sqr (tp, rp, rn); + rn = 2 * rn; rn -= tp[rn - 1] == 0; + MP_PTR_SWAP (rp, tp); + } + } + else + { + if (((par ^ cnt) & 1) == 0) + MP_PTR_SWAP (rp, tp); + + mpn_sqr (rp, bp, bn); + rn = 2 * bn; rn -= rp[rn - 1] == 0; + + for (i = GMP_LIMB_BITS - cnt - 1;;) + { + exp <<= 1; + if ((exp & GMP_LIMB_HIGHBIT) != 0) + { + rn = rn + bn - (mpn_mul (tp, rp, rn, bp, bn) == 0); + MP_PTR_SWAP (rp, tp); + } + + if (--i == 0) + break; + + mpn_sqr (tp, rp, rn); + rn = 2 * rn; rn -= tp[rn - 1] == 0; + MP_PTR_SWAP (rp, tp); + } + } + + return rn; +} diff --git a/gmp-6.3.0/mpn/generic/powlo.c b/gmp-6.3.0/mpn/generic/powlo.c new file mode 100644 index 0000000..c109512 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/powlo.c @@ -0,0 +1,188 @@ +/* mpn_powlo -- Compute R = U^E mod B^n, where B is the limb base. + +Copyright 2007-2009, 2012, 2015, 2016, 2018, 2020 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" +#include "longlong.h" + + +#define getbit(p,bi) \ + ((p[(bi - 1) / GMP_LIMB_BITS] >> (bi - 1) % GMP_LIMB_BITS) & 1) + +static inline mp_limb_t +getbits (const mp_limb_t *p, mp_bitcnt_t bi, unsigned nbits) +{ + unsigned nbits_in_r; + mp_limb_t r; + mp_size_t i; + + if (bi <= nbits) + { + return p[0] & (((mp_limb_t) 1 << bi) - 1); + } + else + { + bi -= nbits; /* bit index of low bit to extract */ + i = bi / GMP_NUMB_BITS; /* word index of low bit to extract */ + bi %= GMP_NUMB_BITS; /* bit index in low word */ + r = p[i] >> bi; /* extract (low) bits */ + nbits_in_r = GMP_NUMB_BITS - bi; /* number of bits now in r */ + if (nbits_in_r < nbits) /* did we get enough bits? */ + r += p[i + 1] << nbits_in_r; /* prepend bits from higher word */ + return r & (((mp_limb_t) 1 << nbits) - 1); + } +} + +static inline unsigned +win_size (mp_bitcnt_t eb) +{ + unsigned k; + static mp_bitcnt_t x[] = {7,25,81,241,673,1793,4609,11521,28161,~(mp_bitcnt_t)0}; + ASSERT (eb > 1); + for (k = 0; eb > x[k++];) + ; + return k; +} + +/* rp[n-1..0] = bp[n-1..0] ^ ep[en-1..0] mod B^n, B is the limb base. + Requires that ep[en-1] is non-zero. + Uses scratch space tp[3n-1..0], i.e., 3n words. */ +/* We only use n words in the scratch space, we should pass tp + n to + mullo/sqrlo as a temporary area, it is needed. */ +void +mpn_powlo (mp_ptr rp, mp_srcptr bp, + mp_srcptr ep, mp_size_t en, + mp_size_t n, mp_ptr tp) +{ + unsigned cnt; + mp_bitcnt_t ebi; + unsigned windowsize, this_windowsize; + mp_limb_t expbits; + mp_limb_t *pp; + long i; + int flipflop; + TMP_DECL; + + ASSERT (en > 1 || (en == 1 && ep[0] > 1)); + + TMP_MARK; + + MPN_SIZEINBASE_2EXP(ebi, ep, en, 1); + + windowsize = win_size (ebi); + if (windowsize > 1) + { + mp_limb_t *this_pp, *last_pp; + ASSERT (windowsize < ebi); + + pp = TMP_ALLOC_LIMBS ((n << (windowsize - 1))); + + this_pp = pp; + + MPN_COPY (this_pp, bp, n); + + /* Store b^2 in tp. */ + mpn_sqrlo (tp, bp, n); + + /* Precompute odd powers of b and put them in the temporary area at pp. */ + i = (1 << (windowsize - 1)) - 1; + do + { + last_pp = this_pp; + this_pp += n; + mpn_mullo_n (this_pp, last_pp, tp, n); + } while (--i != 0); + + expbits = getbits (ep, ebi, windowsize); + ebi -= windowsize; + + /* THINK: Should we initialise the case expbits % 4 == 0 with a mullo? */ + count_trailing_zeros (cnt, expbits); + ebi += cnt; + expbits >>= cnt; + + MPN_COPY (rp, pp + n * (expbits >> 1), n); + } + else + { + pp = tp + n; + MPN_COPY (pp, bp, n); + MPN_COPY (rp, bp, n); + --ebi; + } + + flipflop = 0; + + do + { + while (getbit (ep, ebi) == 0) + { + mpn_sqrlo (tp, rp, n); + MP_PTR_SWAP (rp, tp); + flipflop = ! flipflop; + if (--ebi == 0) + goto done; + } + + /* The next bit of the exponent is 1. Now extract the largest block of + bits <= windowsize, and such that the least significant bit is 1. */ + + expbits = getbits (ep, ebi, windowsize); + this_windowsize = MIN (windowsize, ebi); + + count_trailing_zeros (cnt, expbits); + this_windowsize -= cnt; + ebi -= this_windowsize; + expbits >>= cnt; + + while (this_windowsize > 1) + { + mpn_sqrlo (tp, rp, n); + mpn_sqrlo (rp, tp, n); + this_windowsize -= 2; + } + + if (this_windowsize != 0) + mpn_sqrlo (tp, rp, n); + else + { + MP_PTR_SWAP (rp, tp); + flipflop = ! 
flipflop; + } + + mpn_mullo_n (rp, tp, pp + n * (expbits >> 1), n); + } while (ebi != 0); + + done: + if (flipflop) + MPN_COPY (tp, rp, n); + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/powm.c b/gmp-6.3.0/mpn/generic/powm.c new file mode 100644 index 0000000..1e30f2f --- /dev/null +++ b/gmp-6.3.0/mpn/generic/powm.c @@ -0,0 +1,1003 @@ +/* mpn_powm -- Compute R = U^E mod M. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2007-2012, 2019-2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* + BASIC ALGORITHM, Compute U^E mod M, where M < B^n is odd. + + 1. W <- U + + 2. T <- (B^n * U) mod M Convert to REDC form + + 3. Compute table U^1, U^3, U^5... of E-dependent size + + 4. While there are more bits in E + W <- power left-to-right base-k + + + TODO: + + * Make getbits a macro, thereby allowing it to update the index operand. + That will simplify the code using getbits. (Perhaps make getbits' sibling + getbit then have similar form, for symmetry.) + + * Write an itch function. Or perhaps get rid of tp parameter since the huge + pp area is allocated locally anyway? + + * Choose window size without looping. (Superoptimize or think(tm).) + + * Handle small bases with initial, reduction-free exponentiation. + + * Call new division functions, not mpn_tdiv_qr. + + * Consider special code for one-limb M. + + * How should we handle the redc1/redc2/redc_n choice? + - redc1: T(binvert_1limb) + e * (n) * (T(mullo-1x1) + n*T(addmul_1)) + - redc2: T(binvert_2limbs) + e * (n/2) * (T(mullo-2x2) + n*T(addmul_2)) + - redc_n: T(binvert_nlimbs) + e * (T(mullo-nxn) + T(M(n))) + This disregards the addmul_N constant term, but we could think of + that as part of the respective mullo. + + * When U (the base) is small, we should start the exponentiation with plain + operations, then convert that partial result to REDC form. + + * When U is just one limb, should it be handled without the k-ary tricks? + We could keep a factor of B^n in W, but use U' = BU as base. After + multiplying by this (pseudo two-limb) number, we need to multiply by 1/B + mod M. 
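For orientation, not taken from the GMP sources: the limb-level reduction step used throughout (MPN_REDC_0, defined just below) is one word of Montgomery's REDC. A self-contained word-size rendering follows, with hypothetical names (inv64, redc64), using the positive inverse as MPN_REDC_0 does and assuming unsigned __int128 support.

#include <stdint.h>

// Sketch only: inverse of an odd m modulo 2^64 by Newton's iteration
// (similar in spirit to binvert_limb).
static uint64_t inv64 (uint64_t m)
{
  uint64_t x = m;                 // correct to 3 bits since m*m == 1 (mod 8)
  for (int i = 0; i < 5; i++)
    x *= 2 - m * x;               // each step doubles the number of good bits
  return x;                       // m * x == 1 (mod 2^64)
}

// Sketch only: given t = t1*2^64 + t0 < m*2^64 with m odd, and
// minv = inv64 (m), return t * 2^-64 mod m.
static uint64_t redc64 (uint64_t t1, uint64_t t0, uint64_t m, uint64_t minv)
{
  uint64_t q = t0 * minv;         // q*m == t0 (mod 2^64)
  uint64_t h = (uint64_t) (((unsigned __int128) q * m) >> 64);
  uint64_t r = t1 - h;            // (t - q*m) / 2^64, exact since the low words cancel
  if (t1 < h)
    r += m;                       // fix the borrow; the result is now in [0, m)
  return r;
}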
+*/ + +#include "gmp-impl.h" +#include "longlong.h" + +#undef MPN_REDC_0 +#define MPN_REDC_0(r0, u1, u0, m0, invm) \ + do { \ + mp_limb_t _p1, _u1, _u0, _m0, _r0, _dummy; \ + _u0 = (u0); \ + _m0 = (m0); \ + umul_ppmm (_p1, _dummy, _m0, (_u0 * (invm)) & GMP_NUMB_MASK); \ + ASSERT (((_u0 - _dummy) & GMP_NUMB_MASK) == 0); \ + _u1 = (u1); \ + _r0 = _u1 - _p1; \ + _r0 = _u1 < _p1 ? _r0 + _m0 : _r0; /* _u1 < _r0 */ \ + (r0) = _r0 & GMP_NUMB_MASK; \ + } while (0) + +#undef MPN_REDC_1 +#if HAVE_NATIVE_mpn_sbpi1_bdiv_r +#define MPN_REDC_1(rp, up, mp, n, invm) \ + do { \ + mp_limb_t cy; \ + cy = mpn_sbpi1_bdiv_r (up, 2 * n, mp, n, invm); \ + if (cy != 0) \ + mpn_sub_n (rp, up + n, mp, n); \ + else \ + MPN_COPY (rp, up + n, n); \ + } while (0) +#else +#define MPN_REDC_1(rp, up, mp, n, invm) \ + do { \ + mp_limb_t cy; \ + cy = mpn_redc_1 (rp, up, mp, n, invm); \ + if (cy != 0) \ + mpn_sub_n (rp, rp, mp, n); \ + } while (0) +#endif + +#undef MPN_REDC_2 +#define MPN_REDC_2(rp, up, mp, n, mip) \ + do { \ + mp_limb_t cy; \ + cy = mpn_redc_2 (rp, up, mp, n, mip); \ + if (cy != 0) \ + mpn_sub_n (rp, rp, mp, n); \ + } while (0) + +#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2 +#define WANT_REDC_2 1 +#endif + +#define getbit(p,bi) \ + ((p[(bi - 1) / GMP_LIMB_BITS] >> (bi - 1) % GMP_LIMB_BITS) & 1) + +static inline mp_limb_t +getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits) +{ + int nbits_in_r; + mp_limb_t r; + mp_size_t i; + + if (bi <= nbits) + { + return p[0] & (((mp_limb_t) 1 << bi) - 1); + } + else + { + bi -= nbits; /* bit index of low bit to extract */ + i = bi / GMP_NUMB_BITS; /* word index of low bit to extract */ + bi %= GMP_NUMB_BITS; /* bit index in low word */ + r = p[i] >> bi; /* extract (low) bits */ + nbits_in_r = GMP_NUMB_BITS - bi; /* number of bits now in r */ + if (nbits_in_r < nbits) /* did we get enough bits? */ + r += p[i + 1] << nbits_in_r; /* prepend bits from higher word */ + return r & (((mp_limb_t) 1 << nbits) - 1); + } +} + +static inline int +win_size (mp_bitcnt_t eb) +{ + int k; + static mp_bitcnt_t x[] = {7,25,81,241,673,1793,4609,11521,28161,~(mp_bitcnt_t)0}; + for (k = 0; eb > x[k++]; ) + ; + return k; +} + +/* Convert U to REDC form, U_r = B^n * U mod M */ +static void +redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n) +{ + mp_ptr tp, qp; + TMP_DECL; + TMP_MARK; + + TMP_ALLOC_LIMBS_2 (tp, un + n, qp, un + 1); + + MPN_ZERO (tp, n); + MPN_COPY (tp + n, up, un); + mpn_tdiv_qr (qp, rp, 0L, tp, un + n, mp, n); + TMP_FREE; +} + +#if ! HAVE_NATIVE_mpn_rsblsh1_n_ip2 +#undef mpn_rsblsh1_n_ip2 +#if HAVE_NATIVE_mpn_rsblsh1_n +#define mpn_rsblsh1_n_ip2(a,b,n) mpn_rsblsh1_n(a,b,a,n) +#else +#define mpn_rsblsh1_n_ip2(a,b,n) \ + do \ + { \ + mpn_lshift (a, a, n, 1); \ + mpn_sub_n (a, a, b, n); \ + } while (0) +#endif +#endif + +#define INNERLOOP2 \ + do \ + { \ + MPN_SQR (tp, rp, n); \ + MPN_REDUCE (rp, tp, mp, n, mip); \ + if (mpn_cmp (rp, mp, n) >= 0) \ + ASSERT_NOCARRY (mpn_sub_n (rp, rp, mp, n)); \ + if (getbit (ep, ebi) != 0) \ + { \ + if (rp[n - 1] >> (mbi - 1) % GMP_LIMB_BITS == 0) \ + ASSERT_NOCARRY (mpn_lshift (rp, rp, n, 1)); \ + else \ + mpn_rsblsh1_n_ip2 (rp, mp, n); \ + } \ + } while (--ebi != 0) + +/* rp[n-1..0] = 2 ^ ep[en-1..0] mod mp[n-1..0] + Requires that mp[n-1..0] is odd and > 1. + Requires that ep[en-1..0] is > 1. + Uses scratch space at tp of MAX(mpn_binvert_itch(n),2n) limbs. 
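A word-size sketch of the special base-2 loop that follows (mpn_2powm below, INNERLOOP2 above): squarings are ordinary modular squarings, while the multiply-by-2 step is a doubling done without overflow, in the same spirit as the lshift / rsblsh1 pair. Hypothetical name, unsigned __int128 assumed.

#include <stdint.h>

// Sketch only: 2^e mod m for one word, m odd and > 1.
static uint64_t two_powm_u64 (uint64_t e, uint64_t m)
{
  int top = -1;
  for (int i = 0; i < 64; i++)
    if ((e >> i) & 1)
      top = i;                    // index of the most significant 1 bit

  uint64_t r = 1;
  for (int i = top; i >= 0; i--)
    {
      r = (uint64_t) (((unsigned __int128) r * r) % m);   // modular squaring
      if ((e >> i) & 1)
        r = (r >= m - r) ? r - (m - r) : r + r;           // 2*r mod m, no overflow
    }
  return r;
}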
*/ +static void +mpn_2powm (mp_ptr rp, mp_srcptr ep, mp_size_t en, + mp_srcptr mp, mp_size_t n, mp_ptr tp) +{ + mp_limb_t ip[2], *mip; + mp_bitcnt_t ebi, mbi, tbi; + mp_size_t tn; + int count; + TMP_DECL; + + ASSERT (en > 1 || (en == 1 && ep[0] > 1)); + ASSERT (n > 0 && (mp[0] & 1) != 0); + + MPN_SIZEINBASE_2EXP(ebi, ep, en, 1); + MPN_SIZEINBASE_2EXP(mbi, mp, n, 1); + + if (LIKELY (mbi <= GMP_NUMB_MAX)) + { + count_leading_zeros(count, (mp_limb_t) mbi); + count = GMP_NUMB_BITS - (count - GMP_NAIL_BITS); + } + else + { + mp_bitcnt_t tc = mbi; + + count = 0; + do { ++count; } while ((tc >>= 1) != 0); + } + + tbi = getbits (ep, ebi, count); + if (tbi >= mbi) + { + --count; + ASSERT ((tbi >> count) == 1); + tbi >>= 1; + ASSERT (tbi < mbi); + ASSERT (ebi > count); + } + else if (ebi <= count) + { + MPN_FILL (rp, n, 0); + rp[tbi / GMP_LIMB_BITS] = CNST_LIMB (1) << (tbi % GMP_LIMB_BITS); + return; + } + ebi -= count; + + if (n == 1) + { + mp_limb_t r0, m0, invm; + m0 = *mp; + + /* redcify (rp, tp, tn + 1, mp, n); */ + /* TODO: test direct use of udiv_qrnnd */ + ASSERT (tbi < GMP_LIMB_BITS); + tp[1] = CNST_LIMB (1) << tbi; + tp[0] = CNST_LIMB (0); + r0 = mpn_mod_1 (tp, 2, m0); + + binvert_limb (invm, m0); + do + { + mp_limb_t t0, t1, t2; + /* MPN_SQR (tp, rp, n); */ + umul_ppmm (t1, t0, r0, r0); + /* MPN_REDUCE (rp, tp, mp, n, mip); */ + MPN_REDC_0(r0, t1, t0, m0, invm); + + t2 = r0 << 1; + t2 = r0 > (m0 >> 1) ? t2 - m0 : t2; + r0 = getbit (ep, ebi) != 0 ? t2 : r0; + } while (--ebi != 0); + + /* tp[1] = 0; tp[0] = r0; */ + /* MPN_REDUCE (rp, tp, mp, n, mip); */ + MPN_REDC_0(*rp, 0, r0, m0, invm); + + return; + } + + TMP_MARK; + +#if WANT_REDC_2 + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { + mip = ip; + binvert_limb (ip[0], mp[0]); + ip[0] = -ip[0]; + } + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + { + mip = ip; + mpn_binvert (ip, mp, 2, tp); + ip[0] = -ip[0]; ip[1] = ~ip[1]; + } +#else + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + { + mip = ip; + binvert_limb (ip[0], mp[0]); + ip[0] = -ip[0]; + } +#endif + else + { + mip = TMP_ALLOC_LIMBS (n); + mpn_binvert (mip, mp, n, tp); + } + + tn = tbi / GMP_LIMB_BITS; + MPN_ZERO (tp, tn); + tp[tn] = CNST_LIMB (1) << (tbi % GMP_LIMB_BITS); + + redcify (rp, tp, tn + 1, mp, n); + +#if WANT_REDC_2 + if (REDC_1_TO_REDC_2_THRESHOLD < MUL_TOOM22_THRESHOLD) + { + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { + if (REDC_1_TO_REDC_2_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + } + else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP2; + } + } + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) 
mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP2; + } + } + else + { + if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + } + else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP2; + } + } + +#else /* WANT_REDC_2 */ + + if (REDC_1_TO_REDC_N_THRESHOLD < MUL_TOOM22_THRESHOLD) + { + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + { + if (REDC_1_TO_REDC_N_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + } + else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP2; + } + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP2; + } + } + else + { + if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + else + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + } + else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + { +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP2; + } + else + { +#undef 
MPN_SQR +#undef MPN_REDUCE +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP2; + } + } +#endif /* WANT_REDC_2 */ + + MPN_COPY (tp, rp, n); + MPN_FILL (tp + n, n, 0); + +#if WANT_REDC_2 + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + MPN_REDC_1 (rp, tp, mp, n, ip[0]); + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + MPN_REDC_2 (rp, tp, mp, n, mip); +#else + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + MPN_REDC_1 (rp, tp, mp, n, ip[0]); +#endif + else + mpn_redc_n (rp, tp, mp, n, mip); + + if (mpn_cmp (rp, mp, n) >= 0) + mpn_sub_n (rp, rp, mp, n); + + TMP_FREE; +} + +/* rp[n-1..0] = bp[bn-1..0] ^ ep[en-1..0] mod mp[n-1..0] + Requires that mp[n-1..0] is odd. + Requires that ep[en-1..0] is > 1. + Uses scratch space at tp of MAX(mpn_binvert_itch(n),2n) limbs. */ +void +mpn_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, + mp_srcptr ep, mp_size_t en, + mp_srcptr mp, mp_size_t n, mp_ptr tp) +{ + mp_limb_t ip[2], *mip; + int cnt; + mp_bitcnt_t ebi; + int windowsize, this_windowsize; + mp_limb_t expbits; + mp_ptr pp, this_pp; + long i; + TMP_DECL; + + ASSERT (en > 1 || (en == 1 && ep[0] > 1)); + ASSERT (n >= 1 && ((mp[0] & 1) != 0)); + + if (bn == 1 && bp[0] == 2) + { + mpn_2powm (rp, ep, en, mp, n, tp); + return; + } + + TMP_MARK; + + MPN_SIZEINBASE_2EXP(ebi, ep, en, 1); + +#if 0 + if (bn < n) + { + /* Do the first few exponent bits without mod reductions, + until the result is greater than the mod argument. */ + for (;;) + { + mpn_sqr (tp, this_pp, tn); + tn = tn * 2 - 1, tn += tp[tn] != 0; + if (getbit (ep, ebi) != 0) + mpn_mul (..., tp, tn, bp, bn); + ebi--; + } + } +#endif + + windowsize = win_size (ebi); + +#if WANT_REDC_2 + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { + mip = ip; + binvert_limb (mip[0], mp[0]); + mip[0] = -mip[0]; + } + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + { + mip = ip; + mpn_binvert (mip, mp, 2, tp); + mip[0] = -mip[0]; mip[1] = ~mip[1]; + } +#else + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + { + mip = ip; + binvert_limb (mip[0], mp[0]); + mip[0] = -mip[0]; + } +#endif + else + { + mip = TMP_ALLOC_LIMBS (n); + mpn_binvert (mip, mp, n, tp); + } + + pp = TMP_ALLOC_LIMBS (n << (windowsize - 1)); + + this_pp = pp; + redcify (this_pp, bp, bn, mp, n); + + /* Store b^2 at rp. */ + mpn_sqr (tp, this_pp, n); +#if 0 + if (n == 1) { + MPN_REDC_0 (rp[0], tp[1], tp[0], mp[0], -mip[0]); + } else +#endif +#if WANT_REDC_2 + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + MPN_REDC_1 (rp, tp, mp, n, mip[0]); + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + MPN_REDC_2 (rp, tp, mp, n, mip); +#else + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + MPN_REDC_1 (rp, tp, mp, n, mip[0]); +#endif + else + mpn_redc_n (rp, tp, mp, n, mip); + + /* Precompute odd powers of b and put them in the temporary area at pp. 
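On single words the odd-power table built by this loop looks as below (an illustrative sketch with a hypothetical name, in plain modular arithmetic rather than the REDC form used here). Entry j holds b^(2j+1) mod m, so an odd window value w selects entry w >> 1, matching the pp + n * (expbits >> 1) indexing used by this code.

#include <stdint.h>

// Sketch only: table of b^1, b^3, ..., b^(2^w - 1) mod m, for window size w >= 1.
static void odd_power_table (uint64_t *tab, uint64_t b, uint64_t m, unsigned w)
{
  uint64_t b2 = (uint64_t) (((unsigned __int128) b * b) % m);   // b^2 mod m
  tab[0] = b % m;
  for (unsigned j = 1; j < (1u << (w - 1)); j++)
    tab[j] = (uint64_t) (((unsigned __int128) tab[j - 1] * b2) % m);
  // tab[j] == b^(2*j+1) mod m; an odd window value v indexes tab[v >> 1].
}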
*/ + for (i = (1 << (windowsize - 1)) - 1; i > 0; i--) +#if 1 + if (n == 1) { + umul_ppmm((tp)[1], *(tp), *(this_pp), *(rp)); + ++this_pp ; + MPN_REDC_0 (*this_pp, tp[1], tp[0], *mp, -mip[0]); + } else +#endif + { + mpn_mul_n (tp, this_pp, rp, n); + this_pp += n; +#if WANT_REDC_2 + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + MPN_REDC_1 (this_pp, tp, mp, n, mip[0]); + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + MPN_REDC_2 (this_pp, tp, mp, n, mip); +#else + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + MPN_REDC_1 (this_pp, tp, mp, n, mip[0]); +#endif + else + mpn_redc_n (this_pp, tp, mp, n, mip); + } + + expbits = getbits (ep, ebi, windowsize); + ebi -= windowsize; + + /* THINK: Should we initialise the case expbits % 4 == 0 with a mul? */ + count_trailing_zeros (cnt, expbits); + ebi += cnt; + expbits >>= cnt; + + MPN_COPY (rp, pp + n * (expbits >> 1), n); + +#define INNERLOOP \ + while (ebi != 0) \ + { \ + while (getbit (ep, ebi) == 0) \ + { \ + MPN_SQR (tp, rp, n); \ + MPN_REDUCE (rp, tp, mp, n, mip); \ + if (--ebi == 0) \ + goto done; \ + } \ + \ + /* The next bit of the exponent is 1. Now extract the largest \ + block of bits <= windowsize, and such that the least \ + significant bit is 1. */ \ + \ + expbits = getbits (ep, ebi, windowsize); \ + this_windowsize = MIN (ebi, windowsize); \ + \ + count_trailing_zeros (cnt, expbits); \ + this_windowsize -= cnt; \ + ebi -= this_windowsize; \ + expbits >>= cnt; \ + \ + do \ + { \ + MPN_SQR (tp, rp, n); \ + MPN_REDUCE (rp, tp, mp, n, mip); \ + } \ + while (--this_windowsize != 0); \ + \ + MPN_MUL_N (tp, rp, pp + n * (expbits >> 1), n); \ + MPN_REDUCE (rp, tp, mp, n, mip); \ + } + + + if (n == 1) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) umul_ppmm((r)[1], *(r), *(a), *(b)) +#define MPN_SQR(r,a,n) umul_ppmm((r)[1], *(r), *(a), *(a)) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_0(*(rp), (tp)[1], (tp)[0], *(mp), - *(mip)) + INNERLOOP; + } + else +#if WANT_REDC_2 + if (REDC_1_TO_REDC_2_THRESHOLD < MUL_TOOM22_THRESHOLD) + { + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { + if (REDC_1_TO_REDC_2_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + } + else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP; + } + } + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE 
+#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP; + } + } + else + { + if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + } + else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2 (rp, tp, mp, n, mip) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP; + } + } + +#else /* WANT_REDC_2 */ + + if (REDC_1_TO_REDC_N_THRESHOLD < MUL_TOOM22_THRESHOLD) + { + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + { + if (REDC_1_TO_REDC_N_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + } + else if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP; + } + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) 
mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP; + } + } + else + { + if (BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) + { + if (MUL_TOOM22_THRESHOLD < SQR_BASECASE_THRESHOLD + || BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_mul_basecase (r,a,n,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_basecase (r,a,n,b,n) +#define MPN_SQR(r,a,n) mpn_sqr_basecase (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + } + else if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1 (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + else + { +#undef MPN_MUL_N +#undef MPN_SQR +#undef MPN_REDUCE +#define MPN_MUL_N(r,a,b,n) mpn_mul_n (r,a,b,n) +#define MPN_SQR(r,a,n) mpn_sqr (r,a,n) +#define MPN_REDUCE(rp,tp,mp,n,mip) mpn_redc_n (rp, tp, mp, n, mip) + INNERLOOP; + } + } +#endif /* WANT_REDC_2 */ + + done: + + MPN_COPY (tp, rp, n); + MPN_ZERO (tp + n, n); + +#if WANT_REDC_2 + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + MPN_REDC_1 (rp, tp, mp, n, mip[0]); + else if (BELOW_THRESHOLD (n, REDC_2_TO_REDC_N_THRESHOLD)) + MPN_REDC_2 (rp, tp, mp, n, mip); +#else + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_N_THRESHOLD)) + MPN_REDC_1 (rp, tp, mp, n, mip[0]); +#endif + else + mpn_redc_n (rp, tp, mp, n, mip); + + if (mpn_cmp (rp, mp, n) >= 0) + mpn_sub_n (rp, rp, mp, n); + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/pre_divrem_1.c b/gmp-6.3.0/mpn/generic/pre_divrem_1.c new file mode 100644 index 0000000..3b29d77 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/pre_divrem_1.c @@ -0,0 +1,145 @@ +/* mpn_preinv_divrem_1 -- mpn by limb division with pre-inverted divisor. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 2000-2003 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Don't bloat a shared library with unused code. */ +#if USE_PREINV_DIVREM_1 + +/* Same test here for skipping one divide step as in mpn_divrem_1. 
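As an illustrative sketch of the preinverted 2/1 division step this loop leans on (the general shape of udiv_qrnnd_preinv, after Möller and Granlund, not the exact macro): the names below are hypothetical, d must be normalized (top bit set), u1 < d, and dinv = floor((2^128 - 1)/d) - 2^64; unsigned __int128 is assumed.

#include <stdint.h>

typedef unsigned __int128 u128;

// Sketch only: reciprocal of a normalized 64-bit divisor.
static uint64_t reciprocal_u64 (uint64_t d)
{
  return (uint64_t) (~(u128) 0 / d);     // low word of floor((2^128-1)/d)
}

// Sketch only: divide u1:u0 by d using the precomputed reciprocal dinv.
// Returns the remainder and stores the one-word quotient through *q.
static uint64_t div_2by1_preinv (uint64_t *q, uint64_t u1, uint64_t u0,
                                 uint64_t d, uint64_t dinv)
{
  u128 qe = (u128) dinv * u1;                       // scaled quotient estimate
  qe += ((u128) (u1 + 1) << 64) | u0;               // mod 2^128, as add_ssaaaa would
  uint64_t q1 = (uint64_t) (qe >> 64);
  uint64_t q0 = (uint64_t) qe;
  uint64_t r = u0 - q1 * d;                         // candidate remainder, mod 2^64
  if (r > q0)                                       // estimate was one too high
    {
      q1--;
      r += d;
    }
  if (r >= d)                                       // rare: one too low
    {
      q1++;
      r -= d;
    }
  *q = q1;
  return r;
}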
+ + The main reason for a separate shift==0 case is that not all CPUs give + zero for "n0 >> GMP_LIMB_BITS" which would arise in the general case + code used on shift==0. shift==0 is also reasonably common in mp_bases + big_base, for instance base==10 on a 64-bit limb. + + Under shift!=0 it would be possible to call mpn_lshift to adjust the + dividend all in one go (into the quotient space say), rather than + limb-by-limb in the loop. This might help if mpn_lshift is a lot faster + than what the compiler can generate for EXTRACT. But this is left to CPU + specific implementations to consider, especially since EXTRACT isn't on + the dependent chain. + + If size==0 then the result is simply xsize limbs of zeros, but nothing + special is done for that, since it wouldn't be a usual call, and + certainly never arises from mpn_get_str which is our main caller. */ + +mp_limb_t +mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t xsize, + mp_srcptr ap, mp_size_t size, mp_limb_t d_unnorm, + mp_limb_t dinv, int shift) +{ + mp_limb_t ahigh, qhigh, r; + mp_size_t i; + mp_limb_t n1, n0; + mp_limb_t d; + + ASSERT (xsize >= 0); + ASSERT (size >= 1); + ASSERT (d_unnorm != 0); +#if WANT_ASSERT + { + int want_shift; + mp_limb_t want_dinv; + count_leading_zeros (want_shift, d_unnorm); + ASSERT (shift == want_shift); + invert_limb (want_dinv, d_unnorm << shift); + ASSERT (dinv == want_dinv); + } +#endif + /* FIXME: What's the correct overlap rule when xsize!=0? */ + ASSERT (MPN_SAME_OR_SEPARATE_P (qp+xsize, ap, size)); + + ahigh = ap[size-1]; + d = d_unnorm << shift; + qp += (size + xsize - 1); /* dest high limb */ + + if (shift == 0) + { + /* High quotient limb is 0 or 1, and skip a divide step. */ + r = ahigh; + qhigh = (r >= d); + r = (qhigh ? r-d : r); + *qp-- = qhigh; + size--; + + for (i = size-1; i >= 0; i--) + { + n0 = ap[i]; + udiv_qrnnd_preinv (*qp, r, r, n0, d, dinv); + qp--; + } + } + else + { + r = 0; + if (ahigh < d_unnorm) + { + r = ahigh << shift; + *qp-- = 0; + size--; + if (size == 0) + goto done_integer; + } + + n1 = ap[size-1]; + r |= n1 >> (GMP_LIMB_BITS - shift); + + for (i = size-2; i >= 0; i--) + { + ASSERT (r < d); + n0 = ap[i]; + udiv_qrnnd_preinv (*qp, r, r, + ((n1 << shift) | (n0 >> (GMP_LIMB_BITS - shift))), + d, dinv); + qp--; + n1 = n0; + } + udiv_qrnnd_preinv (*qp, r, r, n1 << shift, d, dinv); + qp--; + } + + done_integer: + for (i = 0; i < xsize; i++) + { + udiv_qrnnd_preinv (*qp, r, r, CNST_LIMB(0), d, dinv); + qp--; + } + + return r >> shift; +} + +#endif /* USE_PREINV_DIVREM_1 */ diff --git a/gmp-6.3.0/mpn/generic/pre_mod_1.c b/gmp-6.3.0/mpn/generic/pre_mod_1.c new file mode 100644 index 0000000..78ae308 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/pre_mod_1.c @@ -0,0 +1,61 @@ +/* mpn_preinv_mod_1 (up, un, d, dinv) -- Divide (UP,,UN) by the normalized D. + DINV should be 2^(2*GMP_LIMB_BITS) / D - 2^GMP_LIMB_BITS. + Return the single-limb remainder. + +Copyright 1991, 1993, 1994, 2000-2002, 2004, 2005 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* This function used to be documented, but is now considered obsolete. It + continues to exist for binary compatibility, even when not required + internally. */ + +mp_limb_t +mpn_preinv_mod_1 (mp_srcptr up, mp_size_t un, mp_limb_t d, mp_limb_t dinv) +{ + mp_size_t i; + mp_limb_t n0, r; + + ASSERT (un >= 1); + ASSERT (d & GMP_LIMB_HIGHBIT); + + r = up[un - 1]; + if (r >= d) + r -= d; + + for (i = un - 2; i >= 0; i--) + { + n0 = up[i]; + udiv_rnnd_preinv (r, r, n0, d, dinv); + } + return r; +} diff --git a/gmp-6.3.0/mpn/generic/random.c b/gmp-6.3.0/mpn/generic/random.c new file mode 100644 index 0000000..485f9eb --- /dev/null +++ b/gmp-6.3.0/mpn/generic/random.c @@ -0,0 +1,50 @@ +/* mpn_random -- Generate random numbers. + +Copyright 2001, 2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +void +mpn_random (mp_ptr ptr, mp_size_t size) +{ + gmp_randstate_ptr rands; + + /* FIXME: Is size==0 supposed to be allowed? */ + ASSERT (size >= 0); + + if (size == 0) + return; + + rands = RANDS; + _gmp_rand (ptr, rands, size * GMP_NUMB_BITS); + + /* Make sure the most significant limb is non-zero. */ + while (ptr[size-1] == 0) + _gmp_rand (&ptr[size-1], rands, GMP_NUMB_BITS); +} diff --git a/gmp-6.3.0/mpn/generic/random2.c b/gmp-6.3.0/mpn/generic/random2.c new file mode 100644 index 0000000..1eede67 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/random2.c @@ -0,0 +1,105 @@ +/* mpn_random2 -- Generate random numbers with relatively long strings + of ones and zeroes. Suitable for border testing. + +Copyright 1992-1994, 1996, 2000-2002, 2004, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +static void gmp_rrandomb (mp_ptr, gmp_randstate_t, mp_bitcnt_t); + +/* Ask _gmp_rand for 32 bits per call unless that's more than a limb can hold. + Thus, we get the same random number sequence in the common cases. + FIXME: We should always generate the same random number sequence! */ +#if GMP_NUMB_BITS < 32 +#define BITS_PER_RANDCALL GMP_NUMB_BITS +#else +#define BITS_PER_RANDCALL 32 +#endif + +void +mpn_random2 (mp_ptr rp, mp_size_t n) +{ + gmp_randstate_ptr rstate = RANDS; + int bit_pos; /* bit number of least significant bit where + next bit field to be inserted */ + mp_limb_t ran, ranm; /* buffer for random bits */ + + /* FIXME: Is n==0 supposed to be allowed? */ + ASSERT (n >= 0); + + _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL); + ran = ranm; + + /* Start off at a random bit position in the most significant limb. */ + bit_pos = ran % GMP_NUMB_BITS; + + gmp_rrandomb (rp, rstate, n * GMP_NUMB_BITS - bit_pos); +} + +static void +gmp_rrandomb (mp_ptr rp, gmp_randstate_t rstate, mp_bitcnt_t nbits) +{ + mp_bitcnt_t bi; + mp_limb_t ranm; /* buffer for random bits */ + unsigned cap_chunksize, chunksize; + mp_size_t i; + + /* Set entire result to 111..1 */ + i = BITS_TO_LIMBS (nbits) - 1; + rp[i] = GMP_NUMB_MAX >> (GMP_NUMB_BITS - (nbits % GMP_NUMB_BITS)) % GMP_NUMB_BITS; + for (i = i - 1; i >= 0; i--) + rp[i] = GMP_NUMB_MAX; + + _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL); + cap_chunksize = nbits / (ranm % 4 + 1); + cap_chunksize += cap_chunksize == 0; /* make it at least 1 */ + + bi = nbits; + + for (;;) + { + _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL); + chunksize = 1 + ranm % cap_chunksize; + bi = (bi < chunksize) ? 0 : bi - chunksize; + + if (bi == 0) + break; /* low chunk is ...1 */ + + rp[bi / GMP_NUMB_BITS] ^= CNST_LIMB (1) << bi % GMP_NUMB_BITS; + + _gmp_rand (&ranm, rstate, BITS_PER_RANDCALL); + chunksize = 1 + ranm % cap_chunksize; + bi = (bi < chunksize) ? 0 : bi - chunksize; + + mpn_incr_u (rp + bi / GMP_NUMB_BITS, CNST_LIMB (1) << bi % GMP_NUMB_BITS); + + if (bi == 0) + break; /* low chunk is ...0 */ + } +} diff --git a/gmp-6.3.0/mpn/generic/redc_1.c b/gmp-6.3.0/mpn/generic/redc_1.c new file mode 100644 index 0000000..eab128f --- /dev/null +++ b/gmp-6.3.0/mpn/generic/redc_1.c @@ -0,0 +1,56 @@ +/* mpn_redc_1. Set rp[] <- up[]/R^n mod mp[]. Clobber up[]. + mp[] is n limbs; up[] is 2n limbs. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + +Copyright (C) 2000-2002, 2004, 2008, 2009, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +mp_limb_t +mpn_redc_1 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_limb_t invm) +{ + mp_size_t j; + mp_limb_t cy; + + ASSERT (n > 0); + ASSERT_MPN (up, 2*n); + + for (j = n - 1; j >= 0; j--) + { + cy = mpn_addmul_1 (up, mp, n, (up[0] * invm) & GMP_NUMB_MASK); + ASSERT (up[0] == 0); + up[0] = cy; + up++; + } + + cy = mpn_add_n (rp, up, up - n, n); + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/redc_2.c b/gmp-6.3.0/mpn/generic/redc_2.c new file mode 100644 index 0000000..8d15589 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/redc_2.c @@ -0,0 +1,110 @@ +/* mpn_redc_2. Set rp[] <- up[]/R^n mod mp[]. Clobber up[]. + mp[] is n limbs; up[] is 2n limbs. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + +Copyright (C) 2000-2002, 2004, 2008, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +#if GMP_NAIL_BITS != 0 +you lose +#endif + +/* For testing purposes, define our own mpn_addmul_2 if there is none already + available. */ +#ifndef HAVE_NATIVE_mpn_addmul_2 +#undef mpn_addmul_2 +static mp_limb_t +mpn_addmul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp) +{ + rp[n] = mpn_addmul_1 (rp, up, n, vp[0]); + return mpn_addmul_1 (rp + 1, up, n, vp[1]); +} +#endif + +#if defined (__GNUC__) && ! 
defined (NO_ASM) \ + && defined (__ia64) && W_TYPE_SIZE == 64 +#define umul2low(ph, pl, uh, ul, vh, vl) \ + do { \ + mp_limb_t _ph, _pl; \ + __asm__ ("xma.hu %0 = %3, %5, f0\n\t" \ + "xma.l %1 = %3, %5, f0\n\t" \ + ";;\n\t" \ + "xma.l %0 = %3, %4, %0\n\t" \ + ";;\n\t" \ + "xma.l %0 = %2, %5, %0" \ + : "=&f" (ph), "=&f" (pl) \ + : "f" (uh), "f" (ul), "f" (vh), "f" (vl)); \ + } while (0) +#endif + +#ifndef umul2low +#define umul2low(ph, pl, uh, ul, vh, vl) \ + do { \ + mp_limb_t _ph, _pl; \ + umul_ppmm (_ph, _pl, ul, vl); \ + (ph) = _ph + (ul) * (vh) + (uh) * (vl); \ + (pl) = _pl; \ + } while (0) +#endif + +mp_limb_t +mpn_redc_2 (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_srcptr mip) +{ + mp_limb_t q[2]; + mp_size_t j; + mp_limb_t upn; + mp_limb_t cy; + + ASSERT (n > 0); + ASSERT_MPN (up, 2*n); + + if ((n & 1) != 0) + { + up[0] = mpn_addmul_1 (up, mp, n, (up[0] * mip[0]) & GMP_NUMB_MASK); + up++; + } + + for (j = n - 2; j >= 0; j -= 2) + { + umul2low (q[1], q[0], mip[1], mip[0], up[1], up[0]); + upn = up[n]; /* mpn_addmul_2 overwrites this */ + up[1] = mpn_addmul_2 (up, mp, n, q); + up[0] = up[n]; + up[n] = upn; + up += 2; + } + + cy = mpn_add_n (rp, up, up - n, n); + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/redc_n.c b/gmp-6.3.0/mpn/generic/redc_n.c new file mode 100644 index 0000000..0c94b7c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/redc_n.c @@ -0,0 +1,80 @@ +/* mpn_redc_n. Set rp[] <- up[]/R^n mod mp[]. Clobber up[]. + mp[] is n limbs; up[] is 2n limbs, the inverse ip[] is n limbs. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + +Copyright 2009, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* + TODO + + * We assume mpn_mulmod_bnm1 is always faster than plain mpn_mul_n (or a + future mpn_mulhi) for the range we will be called. Follow up that + assumption. + + * Decrease scratch usage. + + * Consider removing the residue canonicalisation. 
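+
+  (A toy example of the operation, pretending limbs were decimal digits,
+  i.e. R = 10, and n = 1: for mp = {7} and up = {6,3} = 36, the result is
+  36 * 10^(-1) mod 7 = 36 * 5 mod 7 = 5, since 5 * 10 == 36 (mod 7).  The
+  real code requires n > 8 and a power-of-two limb base.)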
+*/ + +void +mpn_redc_n (mp_ptr rp, mp_ptr up, mp_srcptr mp, mp_size_t n, mp_srcptr ip) +{ + mp_ptr xp, yp, scratch; + mp_limb_t cy; + mp_size_t rn; + TMP_DECL; + TMP_MARK; + + ASSERT (n > 8); + + rn = mpn_mulmod_bnm1_next_size (n); + + scratch = TMP_ALLOC_LIMBS (n + rn + mpn_mulmod_bnm1_itch (rn, n, n)); + + xp = scratch; + mpn_mullo_n (xp, up, ip, n); + + yp = scratch + n; + mpn_mulmod_bnm1 (yp, rn, xp, n, mp, n, scratch + n + rn); + + ASSERT_ALWAYS (2 * n > rn); /* could handle this */ + + cy = mpn_sub_n (yp + rn, yp, up, 2*n - rn); /* undo wrap around */ + MPN_DECR_U (yp + 2*n - rn, rn, cy); + + cy = mpn_sub_n (rp, up + n, yp + n, n); + if (cy != 0) + mpn_add_n (rp, rp, mp, n); + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/remove.c b/gmp-6.3.0/mpn/generic/remove.c new file mode 100644 index 0000000..cbb0742 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/remove.c @@ -0,0 +1,182 @@ +/* mpn_remove -- divide out all multiples of odd mpn number from another mpn + number. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2009, 2012-2014, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if GMP_LIMB_BITS > 50 +#define LOG 50 +#else +#define LOG GMP_LIMB_BITS +#endif + + +/* Input: U = {up,un}, V = {vp,vn} must be odd, cap + Ouput W = {wp,*wn} allocation need is exactly *wn + + Set W = U / V^k, where k is the largest integer <= cap such that the + division yields an integer. + + FIXME: We currently allow any operand overlap. This is quite non mpn-ish + and might be changed, since it cost significant temporary space. + * If we require W to have space for un + 1 limbs, we could save qp or qp2 + (but we will still need to copy things into wp 50% of the time). + * If we allow ourselves to clobber U, we could save the other of qp and qp2, + and the initial COPY (but also here we would need un + 1 limbs). +*/ + +/* FIXME: We need to wrap mpn_bdiv_qr due to the itch interface. This need + indicates a flaw in the current itch mechanism: Which operands not greater + than un,un will incur the worst itch? We need a parallel foo_maxitch set + of functions. 
*/ +static void +mpn_bdiv_qr_wrap (mp_ptr qp, mp_ptr rp, + mp_srcptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn) +{ + mp_ptr scratch_out; + TMP_DECL; + + TMP_MARK; + scratch_out = TMP_ALLOC_LIMBS (mpn_bdiv_qr_itch (nn, dn)); + mpn_bdiv_qr (qp, rp, np, nn, dp, dn, scratch_out); + + TMP_FREE; +} + +mp_bitcnt_t +mpn_remove (mp_ptr wp, mp_size_t *wn, + mp_srcptr up, mp_size_t un, mp_srcptr vp, mp_size_t vn, + mp_bitcnt_t cap) +{ + mp_srcptr pwpsp[LOG]; + mp_size_t pwpsn[LOG]; + mp_size_t npowers; + mp_ptr tp, qp, np, qp2; + mp_srcptr pp; + mp_size_t pn, nn, qn, i; + mp_bitcnt_t pwr; + TMP_DECL; + + ASSERT (un > 0); + ASSERT (vn > 0); + ASSERT (vp[0] % 2 != 0); /* 2-adic division wants odd numbers */ + ASSERT (vn > 1 || vp[0] > 1); /* else we would loop indefinitely */ + + TMP_MARK; + + TMP_ALLOC_LIMBS_3 (qp, un + 1, /* quotient, alternating */ + qp2, un + 1, /* quotient, alternating */ + tp, (un + 1 + vn) / 2); /* remainder */ + pp = vp; + pn = vn; + + MPN_COPY (qp, up, un); + qn = un; + + npowers = 0; + while (qn >= pn) + { + qp[qn] = 0; + mpn_bdiv_qr_wrap (qp2, tp, qp, qn + 1, pp, pn); + if (!mpn_zero_p (tp, pn)) + { + if (mpn_cmp (tp, pp, pn) != 0) + break; /* could not divide by V^npowers */ + } + + MP_PTR_SWAP (qp, qp2); + qn = qn - pn; + mpn_neg (qp, qp, qn+1); + + qn += qp[qn] != 0; + + pwpsp[npowers] = pp; + pwpsn[npowers] = pn; + ++npowers; + + if (((mp_bitcnt_t) 2 << npowers) - 1 > cap) + break; + + nn = 2 * pn - 1; /* next power will be at least this large */ + if (nn > qn) + break; /* next power would be overlarge */ + + if (npowers == 1) /* Alloc once, but only if it's needed */ + np = TMP_ALLOC_LIMBS (qn + LOG); /* powers of V */ + else + np += pn; + + mpn_sqr (np, pp, pn); + pn = nn + (np[nn] != 0); + pp = np; + } + + pwr = ((mp_bitcnt_t) 1 << npowers) - 1; + + for (i = npowers; --i >= 0;) + { + pn = pwpsn[i]; + if (qn < pn) + continue; + + if (pwr + ((mp_bitcnt_t) 1 << i) > cap) + continue; /* V^i would bring us past cap */ + + qp[qn] = 0; + mpn_bdiv_qr_wrap (qp2, tp, qp, qn + 1, pwpsp[i], pn); + if (!mpn_zero_p (tp, pn)) + { + if (mpn_cmp (tp, pwpsp[i], pn) != 0) + continue; /* could not divide by V^i */ + } + + MP_PTR_SWAP (qp, qp2); + qn = qn - pn; + mpn_neg (qp, qp, qn+1); + + qn += qp[qn] != 0; + + pwr += (mp_bitcnt_t) 1 << i; + } + + MPN_COPY (wp, qp, qn); + *wn = qn; + + TMP_FREE; + + return pwr; +} diff --git a/gmp-6.3.0/mpn/generic/rootrem.c b/gmp-6.3.0/mpn/generic/rootrem.c new file mode 100644 index 0000000..a79099e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/rootrem.c @@ -0,0 +1,515 @@ +/* mpn_rootrem(rootp,remp,ap,an,nth) -- Compute the nth root of {ap,an}, and + store the truncated integer part at rootp and the remainder at remp. + + Contributed by Paul Zimmermann (algorithm) and + Paul Zimmermann and Torbjorn Granlund (implementation). + Marco Bodrato wrote logbased_root to seed the loop. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL, AND HAVE MUTABLE INTERFACES. IT'S + ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT'S ALMOST + GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2002, 2005, 2009-2012, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* FIXME: + This implementation is not optimal when remp == NULL, since the complexity + is M(n), whereas it should be M(n/k) on average. +*/ + +#include /* for NULL */ + +#include "gmp-impl.h" +#include "longlong.h" + +static mp_size_t mpn_rootrem_internal (mp_ptr, mp_ptr, mp_srcptr, mp_size_t, + mp_limb_t, int); + +#define MPN_RSHIFT(rp,up,un,cnt) \ + do { \ + if ((cnt) != 0) \ + mpn_rshift (rp, up, un, cnt); \ + else \ + { \ + MPN_COPY_INCR (rp, up, un); \ + } \ + } while (0) + +#define MPN_LSHIFT(cy,rp,up,un,cnt) \ + do { \ + if ((cnt) != 0) \ + cy = mpn_lshift (rp, up, un, cnt); \ + else \ + { \ + MPN_COPY_DECR (rp, up, un); \ + cy = 0; \ + } \ + } while (0) + + +/* Put in {rootp, ceil(un/k)} the kth root of {up, un}, rounded toward zero. + If remp <> NULL, put in {remp, un} the remainder. + Return the size (in limbs) of the remainder if remp <> NULL, + or a non-zero value iff the remainder is non-zero when remp = NULL. + Assumes: + (a) up[un-1] is not zero + (b) rootp has at least space for ceil(un/k) limbs + (c) remp has at least space for un limbs (in case remp <> NULL) + (d) the operands do not overlap. + + The auxiliary memory usage is 3*un+2 if remp = NULL, + and 2*un+2 if remp <> NULL. FIXME: This is an incorrect comment. +*/ +mp_size_t +mpn_rootrem (mp_ptr rootp, mp_ptr remp, + mp_srcptr up, mp_size_t un, mp_limb_t k) +{ + ASSERT (un > 0); + ASSERT (up[un - 1] != 0); + ASSERT (k > 1); + + if (UNLIKELY (k == 2)) + return mpn_sqrtrem (rootp, remp, up, un); + /* (un-1)/k > 2 <=> un > 3k <=> (un + 2)/3 > k */ + if (remp == NULL && (un + 2) / 3 > k) + /* Pad {up,un} with k zero limbs. This will produce an approximate root + with one more limb, allowing us to compute the exact integral result. */ + { + mp_ptr sp, wp; + mp_size_t rn, sn, wn; + TMP_DECL; + TMP_MARK; + wn = un + k; + sn = (un - 1) / k + 2; /* ceil(un/k) + 1 */ + TMP_ALLOC_LIMBS_2 (wp, wn, /* will contain the padded input */ + sp, sn); /* approximate root of padded input */ + MPN_COPY (wp + k, up, un); + MPN_FILL (wp, k, 0); + rn = mpn_rootrem_internal (sp, NULL, wp, wn, k, 1); + /* The approximate root S = {sp,sn} is either the correct root of + {sp,sn}, or 1 too large. Thus unless the least significant limb of + S is 0 or 1, we can deduce the root of {up,un} is S truncated by one + limb. (In case sp[0]=1, we can deduce the root, but not decide + whether it is exact or not.) */ + MPN_COPY (rootp, sp + 1, sn - 1); + TMP_FREE; + return rn; + } + else + { + return mpn_rootrem_internal (rootp, remp, up, un, k, 0); + } +} + +#define LOGROOT_USED_BITS 8 +#define LOGROOT_NEEDS_TWO_CORRECTIONS 1 +#define LOGROOT_RETURNED_BITS (LOGROOT_USED_BITS + LOGROOT_NEEDS_TWO_CORRECTIONS) +/* Puts in *rootp some bits of the k^nt root of the number + 2^bitn * 1.op ; where op represents the "fractional" bits. + + The returned value is the number of bits of the root minus one; + i.e. 
an approximation of the root will be + (*rootp) * 2^(retval-LOGROOT_RETURNED_BITS+1). + + Currently, only LOGROOT_USED_BITS bits of op are used (the implicit + one is not counted). + */ +static unsigned +logbased_root (mp_ptr rootp, mp_limb_t op, mp_bitcnt_t bitn, mp_limb_t k) +{ + /* vlog=vector(256,i,floor((log(256+i)/log(2)-8)*256)-(i>255)) */ + static const + unsigned char vlog[] = {1, 2, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 18, 19, 21, 22, + 23, 25, 26, 27, 29, 30, 31, 33, 34, 35, 37, 38, 39, 40, 42, 43, + 44, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 59, 61, 62, 63, + 64, 65, 67, 68, 69, 70, 71, 73, 74, 75, 76, 77, 78, 80, 81, 82, + 83, 84, 85, 87, 88, 89, 90, 91, 92, 93, 94, 96, 97, 98, 99, 100, + 101, 102, 103, 104, 105, 106, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, + 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134, + 135, 136, 137, 138, 139, 140, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 162, 163, 164, + 165, 166, 167, 168, 169, 170, 171, 172, 173, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 181, 182, 183, 184, 185, 186, 187, 188, 188, 189, 190, 191, 192, 193, + 194, 194, 195, 196, 197, 198, 199, 200, 200, 201, 202, 203, 204, 205, 205, 206, + 207, 208, 209, 209, 210, 211, 212, 213, 214, 214, 215, 216, 217, 218, 218, 219, + 220, 221, 222, 222, 223, 224, 225, 225, 226, 227, 228, 229, 229, 230, 231, 232, + 232, 233, 234, 235, 235, 236, 237, 238, 239, 239, 240, 241, 242, 242, 243, 244, + 245, 245, 246, 247, 247, 248, 249, 250, 250, 251, 252, 253, 253, 254, 255, 255}; + + /* vexp=vector(256,i,floor(2^(8+i/256)-256)-(i>255)) */ + static const + unsigned char vexp[] = {0, 1, 2, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 9, 10, 11, + 12, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 23, + 23, 24, 25, 26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, + 36, 37, 37, 38, 39, 40, 41, 41, 42, 43, 44, 45, 45, 46, 47, 48, + 49, 50, 50, 51, 52, 53, 54, 55, 55, 56, 57, 58, 59, 60, 61, 61, + 62, 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, + 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86, 87, 88, 89, 90, + 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, + 107, 108, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 120, 121, 122, + 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, + 139, 140, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 154, 155, 156, + 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 171, 172, 173, 174, + 175, 176, 178, 179, 180, 181, 182, 183, 185, 186, 187, 188, 189, 191, 192, 193, + 194, 196, 197, 198, 199, 200, 202, 203, 204, 205, 207, 208, 209, 210, 212, 213, + 214, 216, 217, 218, 219, 221, 222, 223, 225, 226, 227, 229, 230, 231, 232, 234, + 235, 236, 238, 239, 240, 242, 243, 245, 246, 247, 249, 250, 251, 253, 254, 255}; + mp_bitcnt_t retval; + + if (UNLIKELY (bitn > (~ (mp_bitcnt_t) 0) >> LOGROOT_USED_BITS)) + { + /* In the unlikely case, we use two divisions and a modulo. */ + retval = bitn / k; + bitn %= k; + bitn = (bitn << LOGROOT_USED_BITS | + vlog[op >> (GMP_NUMB_BITS - LOGROOT_USED_BITS)]) / k; + } + else + { + bitn = (bitn << LOGROOT_USED_BITS | + vlog[op >> (GMP_NUMB_BITS - LOGROOT_USED_BITS)]) / k; + retval = bitn >> LOGROOT_USED_BITS; + bitn &= (CNST_LIMB (1) << LOGROOT_USED_BITS) - 1; + } + ASSERT(bitn < CNST_LIMB (1) << LOGROOT_USED_BITS); + *rootp = CNST_LIMB(1) << (LOGROOT_USED_BITS - ! LOGROOT_NEEDS_TWO_CORRECTIONS) + | vexp[bitn] >> ! 
LOGROOT_NEEDS_TWO_CORRECTIONS; + return retval; +} + +/* if approx is non-zero, does not compute the final remainder */ +static mp_size_t +mpn_rootrem_internal (mp_ptr rootp, mp_ptr remp, mp_srcptr up, mp_size_t un, + mp_limb_t k, int approx) +{ + mp_ptr qp, rp, sp, wp, scratch; + mp_size_t qn, rn, sn, wn, nl, bn; + mp_limb_t save, save2, cy, uh; + mp_bitcnt_t unb; /* number of significant bits of {up,un} */ + mp_bitcnt_t xnb; /* number of significant bits of the result */ + mp_bitcnt_t b, kk; + mp_bitcnt_t sizes[GMP_NUMB_BITS + 1]; + int ni; + int perf_pow; + unsigned ulz, snb, c, logk; + TMP_DECL; + + /* MPN_SIZEINBASE_2EXP(unb, up, un, 1); --unb; */ + uh = up[un - 1]; + count_leading_zeros (ulz, uh); + ulz = ulz - GMP_NAIL_BITS + 1; /* Ignore the first 1. */ + unb = (mp_bitcnt_t) un * GMP_NUMB_BITS - ulz; + /* unb is the (truncated) logarithm of the input U in base 2*/ + + if (unb < k) /* root is 1 */ + { + rootp[0] = 1; + if (remp == NULL) + un -= (*up == CNST_LIMB (1)); /* Non-zero iif {up,un} > 1 */ + else + { + mpn_sub_1 (remp, up, un, CNST_LIMB (1)); + un -= (remp [un - 1] == 0); /* There should be at most one zero limb, + if we demand u to be normalized */ + } + return un; + } + /* if (unb - k < k/2 + k/16) // root is 2 */ + + if (ulz == GMP_NUMB_BITS) + uh = up[un - 2]; + else + uh = (uh << ulz & GMP_NUMB_MASK) | up[un - 1 - (un != 1)] >> (GMP_NUMB_BITS - ulz); + ASSERT (un != 1 || up[un - 1 - (un != 1)] >> (GMP_NUMB_BITS - ulz) == 1); + + xnb = logbased_root (rootp, uh, unb, k); + snb = LOGROOT_RETURNED_BITS - 1; + /* xnb+1 is the number of bits of the root R */ + /* snb+1 is the number of bits of the current approximation S */ + + kk = k * xnb; /* number of truncated bits in the input */ + + /* FIXME: Should we skip the next two loops when xnb <= snb ? */ + for (uh = (k - 1) / 2, logk = 3; (uh >>= 1) != 0; ++logk ) + ; + /* logk = ceil(log(k)/log(2)) + 1 */ + + /* xnb is the number of remaining bits to determine in the kth root */ + for (ni = 0; (sizes[ni] = xnb) > snb; ++ni) + { + /* invariant: here we want xnb+1 total bits for the kth root */ + + /* if c is the new value of xnb, this means that we'll go from a + root of c+1 bits (say s') to a root of xnb+1 bits. + It is proved in the book "Modern Computer Arithmetic" by Brent + and Zimmermann, Chapter 1, that + if s' >= k*beta, then at most one correction is necessary. + Here beta = 2^(xnb-c), and s' >= 2^c, thus it suffices that + c >= ceil((xnb + log2(k))/2). */ + if (xnb > logk) + xnb = (xnb + logk) / 2; + else + --xnb; /* add just one bit at a time */ + } + + *rootp >>= snb - xnb; + kk -= xnb; + + ASSERT_ALWAYS (ni < GMP_NUMB_BITS + 1); + /* We have sizes[0] = b > sizes[1] > ... > sizes[ni] = 0 with + sizes[i] <= 2 * sizes[i+1]. + Newton iteration will first compute sizes[ni-1] extra bits, + then sizes[ni-2], ..., then sizes[0] = b. */ + + TMP_MARK; + /* qp and wp need enough space to store S'^k where S' is an approximate + root. Since S' can be as large as S+2, the worst case is when S=2 and + S'=4. But then since we know the number of bits of S in advance, S' + can only be 3 at most. Similarly for S=4, then S' can be 6 at most. + So the worst case is S'/S=3/2, thus S'^k <= (3/2)^k * S^k. Since S^k + fits in un limbs, the number of extra limbs needed is bounded by + ceil(k*log2(3/2)/GMP_NUMB_BITS). */ + /* THINK: with the use of logbased_root, maybe the constant is + 258/256 instead of 3/2 ? 
log2(258/256) < 1/89 < 1/64 */ +#define EXTRA 2 + (mp_size_t) (0.585 * (double) k / (double) GMP_NUMB_BITS) + TMP_ALLOC_LIMBS_3 (scratch, un + 1, /* used by mpn_div_q */ + qp, un + EXTRA, /* will contain quotient and remainder + of R/(k*S^(k-1)), and S^k */ + wp, un + EXTRA); /* will contain S^(k-1), k*S^(k-1), + and temporary for mpn_pow_1 */ + + if (remp == NULL) + rp = scratch; /* will contain the remainder */ + else + rp = remp; + sp = rootp; + + sn = 1; /* Initial approximation has one limb */ + + for (b = xnb; ni != 0; --ni) + { + /* 1: loop invariant: + {sp, sn} is the current approximation of the root, which has + exactly 1 + sizes[ni] bits. + {rp, rn} is the current remainder + {wp, wn} = {sp, sn}^(k-1) + kk = number of truncated bits of the input + */ + + /* Since each iteration treats b bits from the root and thus k*b bits + from the input, and we already considered b bits from the input, + we now have to take another (k-1)*b bits from the input. */ + kk -= (k - 1) * b; /* remaining input bits */ + /* {rp, rn} = floor({up, un} / 2^kk) */ + rn = un - kk / GMP_NUMB_BITS; + MPN_RSHIFT (rp, up + kk / GMP_NUMB_BITS, rn, kk % GMP_NUMB_BITS); + rn -= rp[rn - 1] == 0; + + /* 9: current buffers: {sp,sn}, {rp,rn} */ + + for (c = 0;; c++) + { + /* Compute S^k in {qp,qn}. */ + /* W <- S^(k-1) for the next iteration, + and S^k = W * S. */ + wn = mpn_pow_1 (wp, sp, sn, k - 1, qp); + mpn_mul (qp, wp, wn, sp, sn); + qn = wn + sn; + qn -= qp[qn - 1] == 0; + + perf_pow = 1; + /* if S^k > floor(U/2^kk), the root approximation was too large */ + if (qn > rn || (qn == rn && (perf_pow=mpn_cmp (qp, rp, rn)) > 0)) + MPN_DECR_U (sp, sn, 1); + else + break; + } + + /* 10: current buffers: {sp,sn}, {rp,rn}, {qp,qn}, {wp,wn} */ + + /* sometimes two corrections are needed with logbased_root*/ + ASSERT (c <= 1 + LOGROOT_NEEDS_TWO_CORRECTIONS); + ASSERT_ALWAYS (rn >= qn); + + b = sizes[ni - 1] - sizes[ni]; /* number of bits to compute in the + next iteration */ + bn = b / GMP_NUMB_BITS; /* lowest limb from high part of rp[], after shift */ + + kk = kk - b; + /* nl is the number of limbs in U which contain bits [kk,kk+b-1] */ + nl = 1 + (kk + b - 1) / GMP_NUMB_BITS - (kk / GMP_NUMB_BITS); + /* nl = 1 + floor((kk + b - 1) / GMP_NUMB_BITS) + - floor(kk / GMP_NUMB_BITS) + <= 1 + (kk + b - 1) / GMP_NUMB_BITS + - (kk - GMP_NUMB_BITS + 1) / GMP_NUMB_BITS + = 2 + (b - 2) / GMP_NUMB_BITS + thus since nl is an integer: + nl <= 2 + floor(b/GMP_NUMB_BITS) <= 2 + bn. */ + + /* 11: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */ + + /* R = R - Q = floor(U/2^kk) - S^k */ + if (perf_pow != 0) + { + mpn_sub (rp, rp, rn, qp, qn); + MPN_NORMALIZE_NOT_ZERO (rp, rn); + + /* first multiply the remainder by 2^b */ + MPN_LSHIFT (cy, rp + bn, rp, rn, b % GMP_NUMB_BITS); + rn = rn + bn; + if (cy != 0) + { + rp[rn] = cy; + rn++; + } + + save = rp[bn]; + /* we have to save rp[bn] up to rp[nl-1], i.e. 
1 or 2 limbs */ + if (nl - 1 > bn) + save2 = rp[bn + 1]; + } + else + { + rn = bn; + save2 = save = 0; + } + /* 2: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */ + + /* Now insert bits [kk,kk+b-1] from the input U */ + MPN_RSHIFT (rp, up + kk / GMP_NUMB_BITS, nl, kk % GMP_NUMB_BITS); + /* set to zero high bits of rp[bn] */ + rp[bn] &= (CNST_LIMB (1) << (b % GMP_NUMB_BITS)) - 1; + /* restore corresponding bits */ + rp[bn] |= save; + if (nl - 1 > bn) + rp[bn + 1] = save2; /* the low b bits go in rp[0..bn] only, since + they start by bit 0 in rp[0], so they use + at most ceil(b/GMP_NUMB_BITS) limbs */ + /* FIXME: Should we normalise {rp,rn} here ?*/ + + /* 3: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */ + + /* compute {wp, wn} = k * {sp, sn}^(k-1) */ + cy = mpn_mul_1 (wp, wp, wn, k); + wp[wn] = cy; + wn += cy != 0; + + /* 6: current buffers: {sp,sn}, {qp,qn} */ + + /* multiply the root approximation by 2^b */ + MPN_LSHIFT (cy, sp + b / GMP_NUMB_BITS, sp, sn, b % GMP_NUMB_BITS); + sn = sn + b / GMP_NUMB_BITS; + if (cy != 0) + { + sp[sn] = cy; + sn++; + } + + save = sp[b / GMP_NUMB_BITS]; + + /* Number of limbs used by b bits, when least significant bit is + aligned to least limb */ + bn = (b - 1) / GMP_NUMB_BITS + 1; + + /* 4: current buffers: {sp,sn}, {rp,rn}, {wp,wn} */ + + /* now divide {rp, rn} by {wp, wn} to get the low part of the root */ + if (UNLIKELY (rn < wn)) + { + MPN_FILL (sp, bn, 0); + } + else + { + qn = rn - wn; /* expected quotient size */ + if (qn <= bn) { /* Divide only if result is not too big. */ + mpn_div_q (qp, rp, rn, wp, wn, scratch); + qn += qp[qn] != 0; + } + + /* 5: current buffers: {sp,sn}, {qp,qn}. + Note: {rp,rn} is not needed any more since we'll compute it from + scratch at the end of the loop. + */ + + /* the quotient should be smaller than 2^b, since the previous + approximation was correctly rounded toward zero */ + if (qn > bn || (qn == bn && (b % GMP_NUMB_BITS != 0) && + qp[qn - 1] >= (CNST_LIMB (1) << (b % GMP_NUMB_BITS)))) + { + for (qn = 1; qn < bn; ++qn) + sp[qn - 1] = GMP_NUMB_MAX; + sp[qn - 1] = GMP_NUMB_MAX >> (GMP_NUMB_BITS - 1 - ((b - 1) % GMP_NUMB_BITS)); + } + else + { + /* 7: current buffers: {sp,sn}, {qp,qn} */ + + /* Combine sB and q to form sB + q. */ + MPN_COPY (sp, qp, qn); + MPN_ZERO (sp + qn, bn - qn); + } + } + sp[b / GMP_NUMB_BITS] |= save; + + /* 8: current buffer: {sp,sn} */ + + } + + /* otherwise we have rn > 0, thus the return value is ok */ + if (!approx || sp[0] <= CNST_LIMB (1)) + { + for (c = 0;; c++) + { + /* Compute S^k in {qp,qn}. */ + /* Last iteration: we don't need W anymore. */ + /* mpn_pow_1 requires that both qp and wp have enough + space to store the result {sp,sn}^k + 1 limb */ + qn = mpn_pow_1 (qp, sp, sn, k, wp); + + perf_pow = 1; + if (qn > un || (qn == un && (perf_pow=mpn_cmp (qp, up, un)) > 0)) + MPN_DECR_U (sp, sn, 1); + else + break; + }; + + /* sometimes two corrections are needed with logbased_root*/ + ASSERT (c <= 1 + LOGROOT_NEEDS_TWO_CORRECTIONS); + + rn = perf_pow != 0; + if (rn != 0 && remp != NULL) + { + mpn_sub (remp, up, un, qp, qn); + rn = un; + MPN_NORMALIZE_NOT_ZERO (remp, rn); + } + } + + TMP_FREE; + return rn; +} diff --git a/gmp-6.3.0/mpn/generic/rshift.c b/gmp-6.3.0/mpn/generic/rshift.c new file mode 100644 index 0000000..15d427d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/rshift.c @@ -0,0 +1,69 @@ +/* mpn_rshift -- Shift right low level. + +Copyright 1991, 1993, 1994, 1996, 2000-2002 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* Shift U (pointed to by up and N limbs long) cnt bits to the right + and store the n least significant limbs of the result at rp. + The bits shifted out to the right are returned. + + Argument constraints: + 1. 0 < cnt < GMP_NUMB_BITS. + 2. If the result is to be written over the input, rp must be <= up. +*/ + +mp_limb_t +mpn_rshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt) +{ + mp_limb_t high_limb, low_limb; + unsigned int tnc; + mp_size_t i; + mp_limb_t retval; + + ASSERT (n >= 1); + ASSERT (cnt >= 1); + ASSERT (cnt < GMP_NUMB_BITS); + ASSERT (MPN_SAME_OR_INCR_P (rp, up, n)); + + tnc = GMP_NUMB_BITS - cnt; + high_limb = *up++; + retval = (high_limb << tnc) & GMP_NUMB_MASK; + low_limb = high_limb >> cnt; + + for (i = n - 1; i != 0; i--) + { + high_limb = *up++; + *rp++ = low_limb | ((high_limb << tnc) & GMP_NUMB_MASK); + low_limb = high_limb >> cnt; + } + *rp = low_limb; + + return retval; +} diff --git a/gmp-6.3.0/mpn/generic/sbpi1_bdiv_q.c b/gmp-6.3.0/mpn/generic/sbpi1_bdiv_q.c new file mode 100644 index 0000000..850e593 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sbpi1_bdiv_q.c @@ -0,0 +1,96 @@ +/* mpn_sbpi1_bdiv_q -- schoolbook Hensel division with precomputed inverse, + returning quotient only. + + Contributed to the GNU project by Niels Möller and Torbjörn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. + IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2005, 2006, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + +/* Computes Q = - U / D mod B^un, destroys U. + + D must be odd. dinv is (-D)^-1 mod B. + +*/ + +void +mpn_sbpi1_bdiv_q (mp_ptr qp, + mp_ptr up, mp_size_t un, + mp_srcptr dp, mp_size_t dn, + mp_limb_t dinv) +{ + mp_size_t i; + mp_limb_t q; + + ASSERT (dn > 0); + ASSERT (un >= dn); + ASSERT ((dp[0] & 1) != 0); + ASSERT (-(dp[0] * dinv) == 1); + ASSERT (up == qp || !MPN_OVERLAP_P (up, un, qp, un - dn)); + + if (un > dn) + { + mp_limb_t cy, hi; + for (i = un - dn - 1, cy = 0; i > 0; i--) + { + q = dinv * up[0]; + hi = mpn_addmul_1 (up, dp, dn, q); + + ASSERT (up[0] == 0); + *qp++ = q; + hi += cy; + cy = hi < cy; + hi += up[dn]; + cy += hi < up[dn]; + up[dn] = hi; + up++; + } + q = dinv * up[0]; + hi = cy + mpn_addmul_1 (up, dp, dn, q); + ASSERT (up[0] == 0); + *qp++ = q; + up[dn] += hi; + up++; + } + for (i = dn; i > 1; i--) + { + mp_limb_t q = dinv * up[0]; + mpn_addmul_1 (up, dp, i, q); + ASSERT (up[0] == 0); + *qp++ = q; + up++; + } + + /* Final limb */ + *qp = dinv * up[0]; +} diff --git a/gmp-6.3.0/mpn/generic/sbpi1_bdiv_qr.c b/gmp-6.3.0/mpn/generic/sbpi1_bdiv_qr.c new file mode 100644 index 0000000..6146c45 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sbpi1_bdiv_qr.c @@ -0,0 +1,82 @@ +/* mpn_sbpi1_bdiv_qr -- schoolbook Hensel division with precomputed inverse, + returning quotient and remainder. + + Contributed to the GNU project by Niels Möller and Torbjörn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. + IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +/* Computes a binary quotient of size qn = un - dn. + Output: + + Q = -U * D^{-1} mod B^qn, + + R = (U + Q * D) * B^(-qn) + + Stores the dn least significant limbs of R at {up + un - dn, dn}, + and returns the carry from the addition N + Q*D. + + D must be odd. dinv is (-D)^-1 mod B. 
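+
+   For illustration (a toy sketch, pretending GMP_NUMB_BITS = 4 so that
+   B = 16): take U = {7,2} = 39, D = {3}, hence dinv = (-3)^-1 mod 16 = 5
+   and qn = 1.  The single step gives q = 5*7 mod 16 = 3, and
+   U + q*D = 48 = 3*B, so the stored remainder limb is 3 and the returned
+   carry is 0.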
*/ + +mp_limb_t +mpn_sbpi1_bdiv_qr (mp_ptr qp, + mp_ptr up, mp_size_t un, + mp_srcptr dp, mp_size_t dn, mp_limb_t dinv) +{ + mp_size_t i; + mp_limb_t cy; + + ASSERT (dn > 0); + ASSERT (un > dn); + ASSERT ((dp[0] & 1) != 0); + ASSERT (-(dp[0] * dinv) == 1); + ASSERT (up == qp || !MPN_OVERLAP_P (up, un, qp, un - dn)); + + for (i = un - dn, cy = 0; i != 0; i--) + { + mp_limb_t q = dinv * up[0]; + mp_limb_t hi = mpn_addmul_1 (up, dp, dn, q); + *qp++ = q; + + hi += cy; + cy = hi < cy; + hi += up[dn]; + cy += hi < up[dn]; + up[dn] = hi; + up++; + } + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/sbpi1_bdiv_r.c b/gmp-6.3.0/mpn/generic/sbpi1_bdiv_r.c new file mode 100644 index 0000000..a609951 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sbpi1_bdiv_r.c @@ -0,0 +1,79 @@ +/* mpn_sbpi1_bdiv_r -- schoolbook Hensel division with precomputed inverse, + returning remainder. + + Contributed to the GNU project by Niels Möller and Torbjörn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL FUNCTIONS WITH MUTABLE INTERFACES. + IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2006, 2009, 2011, 2012, 2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + + +/* Computes a binary quotient of size qn = un - dn. + Output: + + Q = -U * D^{-1} mod B^qn, + + R = (U + Q * D) * B^(-qn) + + Stores the dn least significant limbs of R at {up + un - dn, dn}, + and returns the carry from the addition N + Q*D. + + D must be odd. dinv is (-D)^-1 mod B. */ + +mp_limb_t +mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un, + mp_srcptr dp, mp_size_t dn, mp_limb_t dinv) +{ + mp_size_t i; + mp_limb_t cy; + + ASSERT (dn > 0); + ASSERT (un > dn); + ASSERT ((dp[0] & 1) != 0); + ASSERT (-(dp[0] * dinv) == 1); + + for (i = un - dn, cy = 0; i != 0; i--) + { + mp_limb_t q = dinv * up[0]; + mp_limb_t hi = mpn_addmul_1 (up, dp, dn, q); + + hi += cy; + cy = hi < cy; + hi += up[dn]; + cy += hi < up[dn]; + up[dn] = hi; + up++; + } + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/sbpi1_div_q.c b/gmp-6.3.0/mpn/generic/sbpi1_div_q.c new file mode 100644 index 0000000..a9975eb --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sbpi1_div_q.c @@ -0,0 +1,302 @@ +/* mpn_sbpi1_div_q -- Schoolbook division using the Möller-Granlund 3/2 + division algorithm. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. 
IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2007, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_sbpi1_div_q (mp_ptr qp, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_limb_t dinv) +{ + mp_limb_t qh; + mp_size_t qn, i; + mp_limb_t n1, n0; + mp_limb_t d1, d0; + mp_limb_t cy, cy1; + mp_limb_t q; + mp_limb_t flag; + + mp_size_t dn_orig = dn; + mp_srcptr dp_orig = dp; + mp_ptr np_orig = np; + + ASSERT (dn > 2); + ASSERT (nn >= dn); + ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0); + + np += nn; + + qn = nn - dn; + if (qn + 1 < dn) + { + dp += dn - (qn + 1); + dn = qn + 1; + } + + qh = mpn_cmp (np - dn, dp, dn) >= 0; + if (qh != 0) + mpn_sub_n (np - dn, np - dn, dp, dn); + + qp += qn; + + dn -= 2; /* offset dn by 2 for main division loops, + saving two iterations in mpn_submul_1. */ + d1 = dp[dn + 1]; + d0 = dp[dn + 0]; + + np -= 2; + + n1 = np[1]; + + for (i = qn - (dn + 2); i >= 0; i--) + { + np--; + if (UNLIKELY (n1 == d1) && np[1] == d0) + { + q = GMP_NUMB_MASK; + mpn_submul_1 (np - dn, dp, dn + 2, q); + n1 = np[1]; /* update n1, last loop's value will now be invalid */ + } + else + { + udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + + cy = mpn_submul_1 (np - dn, dp, dn, q); + + cy1 = n0 < cy; + n0 = (n0 - cy) & GMP_NUMB_MASK; + cy = n1 < cy1; + n1 -= cy1; + np[0] = n0; + + if (UNLIKELY (cy != 0)) + { + n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1); + q--; + } + } + + *--qp = q; + } + + flag = ~CNST_LIMB(0); + + if (dn >= 0) + { + for (i = dn; i > 0; i--) + { + np--; + if (UNLIKELY (n1 >= (d1 & flag))) + { + q = GMP_NUMB_MASK; + cy = mpn_submul_1 (np - dn, dp, dn + 2, q); + + if (UNLIKELY (n1 != cy)) + { + if (n1 < (cy & flag)) + { + q--; + mpn_add_n (np - dn, np - dn, dp, dn + 2); + } + else + flag = 0; + } + n1 = np[1]; + } + else + { + udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + + cy = mpn_submul_1 (np - dn, dp, dn, q); + + cy1 = n0 < cy; + n0 = (n0 - cy) & GMP_NUMB_MASK; + cy = n1 < cy1; + n1 -= cy1; + np[0] = n0; + + if (UNLIKELY (cy != 0)) + { + n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1); + q--; + } + } + + *--qp = q; + + /* Truncate operands. 
*/ + dn--; + dp++; + } + + np--; + if (UNLIKELY (n1 >= (d1 & flag))) + { + q = GMP_NUMB_MASK; + cy = mpn_submul_1 (np, dp, 2, q); + + if (UNLIKELY (n1 != cy)) + { + if (n1 < (cy & flag)) + { + q--; + add_ssaaaa (np[1], np[0], np[1], np[0], dp[1], dp[0]); + } + else + flag = 0; + } + n1 = np[1]; + } + else + { + udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + + np[0] = n0; + np[1] = n1; + } + + *--qp = q; + } + ASSERT_ALWAYS (np[1] == n1); + np += 2; + + + dn = dn_orig; + if (UNLIKELY (n1 < (dn & flag))) + { + mp_limb_t q, x; + + /* The quotient may be too large if the remainder is small. Recompute + for above ignored operand parts, until the remainder spills. + + FIXME: The quality of this code isn't the same as the code above. + 1. We don't compute things in an optimal order, high-to-low, in order + to terminate as quickly as possible. + 2. We mess with pointers and sizes, adding and subtracting and + adjusting to get things right. It surely could be streamlined. + 3. The only termination criteria are that we determine that the + quotient needs to be adjusted, or that we have recomputed + everything. We should stop when the remainder is so large + that no additional subtracting could make it spill. + 4. If nothing else, we should not do two loops of submul_1 over the + data, instead handle both the triangularization and chopping at + once. */ + + x = n1; + + if (dn > 2) + { + /* Compensate for triangularization. */ + mp_limb_t y; + + dp = dp_orig; + if (qn + 1 < dn) + { + dp += dn - (qn + 1); + dn = qn + 1; + } + + y = np[-2]; + + for (i = dn - 3; i >= 0; i--) + { + q = qp[i]; + cy = mpn_submul_1 (np - (dn - i), dp, dn - i - 2, q); + + if (y < cy) + { + if (x == 0) + { + cy = mpn_sub_1 (qp, qp, qn, 1); + ASSERT_ALWAYS (cy == 0); + return qh - cy; + } + x--; + } + y -= cy; + } + np[-2] = y; + } + + dn = dn_orig; + if (qn + 1 < dn) + { + /* Compensate for ignored dividend and divisor tails. */ + + dp = dp_orig; + np = np_orig; + + if (qh != 0) + { + cy = mpn_sub_n (np + qn, np + qn, dp, dn - (qn + 1)); + if (cy != 0) + { + if (x == 0) + { + if (qn != 0) + cy = mpn_sub_1 (qp, qp, qn, 1); + return qh - cy; + } + x--; + } + } + + if (qn == 0) + return qh; + + for (i = dn - qn - 2; i >= 0; i--) + { + cy = mpn_submul_1 (np + i, qp, qn, dp[i]); + cy = mpn_sub_1 (np + qn + i, np + qn + i, dn - qn - i - 1, cy); + if (cy != 0) + { + if (x == 0) + { + cy = mpn_sub_1 (qp, qp, qn, 1); + return qh; + } + x--; + } + } + } + } + + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/sbpi1_div_qr.c b/gmp-6.3.0/mpn/generic/sbpi1_div_qr.c new file mode 100644 index 0000000..7330a77 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sbpi1_div_qr.c @@ -0,0 +1,109 @@ +/* mpn_sbpi1_div_qr -- Schoolbook division using the Möller-Granlund 3/2 + division algorithm. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2007, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_sbpi1_div_qr (mp_ptr qp, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_limb_t dinv) +{ + mp_limb_t qh; + mp_size_t i; + mp_limb_t n1, n0; + mp_limb_t d1, d0; + mp_limb_t cy, cy1; + mp_limb_t q; + + ASSERT (dn > 2); + ASSERT (nn >= dn); + ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0); + + np += nn; + + qh = mpn_cmp (np - dn, dp, dn) >= 0; + if (qh != 0) + mpn_sub_n (np - dn, np - dn, dp, dn); + + qp += nn - dn; + + dn -= 2; /* offset dn by 2 for main division loops, + saving two iterations in mpn_submul_1. */ + d1 = dp[dn + 1]; + d0 = dp[dn + 0]; + + np -= 2; + + n1 = np[1]; + + for (i = nn - (dn + 2); i > 0; i--) + { + np--; + if (UNLIKELY (n1 == d1) && np[1] == d0) + { + q = GMP_NUMB_MASK; + mpn_submul_1 (np - dn, dp, dn + 2, q); + n1 = np[1]; /* update n1, last loop's value will now be invalid */ + } + else + { + udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + + cy = mpn_submul_1 (np - dn, dp, dn, q); + + cy1 = n0 < cy; + n0 = (n0 - cy) & GMP_NUMB_MASK; + cy = n1 < cy1; + n1 = (n1 - cy1) & GMP_NUMB_MASK; + np[0] = n0; + + if (UNLIKELY (cy != 0)) + { + n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1); + q--; + } + } + + *--qp = q; + } + np[1] = n1; + + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/sbpi1_divappr_q.c b/gmp-6.3.0/mpn/generic/sbpi1_divappr_q.c new file mode 100644 index 0000000..ef7ca26 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sbpi1_divappr_q.c @@ -0,0 +1,198 @@ +/* mpn_sbpi1_divappr_q -- Schoolbook division using the Möller-Granlund 3/2 + division algorithm, returning approximate quotient. The quotient returned + is either correct, or one too large. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE. + +Copyright 2007, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_sbpi1_divappr_q (mp_ptr qp, + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_limb_t dinv) +{ + mp_limb_t qh; + mp_size_t qn, i; + mp_limb_t n1, n0; + mp_limb_t d1, d0; + mp_limb_t cy, cy1; + mp_limb_t q; + mp_limb_t flag; + + ASSERT (dn > 2); + ASSERT (nn >= dn); + ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0); + + np += nn; + + qn = nn - dn; + if (qn + 1 < dn) + { + dp += dn - (qn + 1); + dn = qn + 1; + } + + qh = mpn_cmp (np - dn, dp, dn) >= 0; + if (qh != 0) + mpn_sub_n (np - dn, np - dn, dp, dn); + + qp += qn; + + dn -= 2; /* offset dn by 2 for main division loops, + saving two iterations in mpn_submul_1. */ + d1 = dp[dn + 1]; + d0 = dp[dn + 0]; + + np -= 2; + + n1 = np[1]; + + for (i = qn - (dn + 2); i >= 0; i--) + { + np--; + if (UNLIKELY (n1 == d1) && np[1] == d0) + { + q = GMP_NUMB_MASK; + mpn_submul_1 (np - dn, dp, dn + 2, q); + n1 = np[1]; /* update n1, last loop's value will now be invalid */ + } + else + { + udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + + cy = mpn_submul_1 (np - dn, dp, dn, q); + + cy1 = n0 < cy; + n0 = (n0 - cy) & GMP_NUMB_MASK; + cy = n1 < cy1; + n1 -= cy1; + np[0] = n0; + + if (UNLIKELY (cy != 0)) + { + n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1); + q--; + } + } + + *--qp = q; + } + + flag = ~CNST_LIMB(0); + + if (dn >= 0) + { + for (i = dn; i > 0; i--) + { + np--; + if (UNLIKELY (n1 >= (d1 & flag))) + { + q = GMP_NUMB_MASK; + cy = mpn_submul_1 (np - dn, dp, dn + 2, q); + + if (UNLIKELY (n1 != cy)) + { + if (n1 < (cy & flag)) + { + q--; + mpn_add_n (np - dn, np - dn, dp, dn + 2); + } + else + flag = 0; + } + n1 = np[1]; + } + else + { + udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + + cy = mpn_submul_1 (np - dn, dp, dn, q); + + cy1 = n0 < cy; + n0 = (n0 - cy) & GMP_NUMB_MASK; + cy = n1 < cy1; + n1 -= cy1; + np[0] = n0; + + if (UNLIKELY (cy != 0)) + { + n1 += d1 + mpn_add_n (np - dn, np - dn, dp, dn + 1); + q--; + } + } + + *--qp = q; + + /* Truncate operands. */ + dn--; + dp++; + } + + np--; + if (UNLIKELY (n1 >= (d1 & flag))) + { + q = GMP_NUMB_MASK; + cy = mpn_submul_1 (np, dp, 2, q); + + if (UNLIKELY (n1 != cy)) + { + if (n1 < (cy & flag)) + { + q--; + add_ssaaaa (np[1], np[0], np[1], np[0], dp[1], dp[0]); + } + else + flag = 0; + } + n1 = np[1]; + } + else + { + udiv_qr_3by2 (q, n1, n0, n1, np[1], np[0], d1, d0, dinv); + + np[1] = n1; + np[0] = n0; + } + + *--qp = q; + } + + ASSERT_ALWAYS (np[1] == n1); + + return qh; +} diff --git a/gmp-6.3.0/mpn/generic/scan0.c b/gmp-6.3.0/mpn/generic/scan0.c new file mode 100644 index 0000000..d71832e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/scan0.c @@ -0,0 +1,59 @@ +/* mpn_scan0 -- Scan from a given bit position for the next clear bit. + +Copyright 1994, 1996, 2001, 2002, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Argument constraints: + 1. U must sooner or later have a limb with a clear bit. + */ + +mp_bitcnt_t +mpn_scan0 (mp_srcptr up, mp_bitcnt_t starting_bit) +{ + mp_size_t starting_word; + mp_limb_t alimb; + int cnt; + mp_srcptr p; + + /* Start at the word implied by STARTING_BIT. */ + starting_word = starting_bit / GMP_NUMB_BITS; + p = up + starting_word; + alimb = *p++ ^ GMP_NUMB_MASK; + + /* Mask off any bits before STARTING_BIT in the first limb. */ + alimb &= - (mp_limb_t) 1 << (starting_bit % GMP_NUMB_BITS); + + while (alimb == 0) + alimb = *p++ ^ GMP_NUMB_MASK; + + count_trailing_zeros (cnt, alimb); + return (p - up - 1) * GMP_NUMB_BITS + cnt; +} diff --git a/gmp-6.3.0/mpn/generic/scan1.c b/gmp-6.3.0/mpn/generic/scan1.c new file mode 100644 index 0000000..09e8060 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/scan1.c @@ -0,0 +1,59 @@ +/* mpn_scan1 -- Scan from a given bit position for the next set bit. + +Copyright 1994, 1996, 2001, 2002, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* Argument constraints: + 1. U must sooner or later have a limb != 0. + */ + +mp_bitcnt_t +mpn_scan1 (mp_srcptr up, mp_bitcnt_t starting_bit) +{ + mp_size_t starting_word; + mp_limb_t alimb; + int cnt; + mp_srcptr p; + + /* Start at the word implied by STARTING_BIT. */ + starting_word = starting_bit / GMP_NUMB_BITS; + p = up + starting_word; + alimb = *p++; + + /* Mask off any bits before STARTING_BIT in the first limb. */ + alimb &= - (mp_limb_t) 1 << (starting_bit % GMP_NUMB_BITS); + + while (alimb == 0) + alimb = *p++; + + count_trailing_zeros (cnt, alimb); + return (p - up - 1) * GMP_NUMB_BITS + cnt; +} diff --git a/gmp-6.3.0/mpn/generic/sec_aors_1.c b/gmp-6.3.0/mpn/generic/sec_aors_1.c new file mode 100644 index 0000000..6480fa1 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_aors_1.c @@ -0,0 +1,59 @@ +/* mpn_sec_add_1, mpn_sec_sub_1 + + Contributed to the GNU project by Niels Möller + +Copyright 2013, 2014 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if OPERATION_sec_add_1 +#define FNAME mpn_sec_add_1 +#define FNAME_itch mpn_sec_add_1_itch +#define OP_N mpn_add_n +#endif +#if OPERATION_sec_sub_1 +#define FNAME mpn_sec_sub_1 +#define FNAME_itch mpn_sec_sub_1_itch +#define OP_N mpn_sub_n +#endif + +/* It's annoying to that we need scratch space */ +mp_size_t +FNAME_itch (mp_size_t n) +{ + return n; +} + +mp_limb_t +FNAME (mp_ptr rp, mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_ptr scratch) +{ + scratch[0] = b; + MPN_ZERO (scratch + 1, n-1); + return OP_N (rp, ap, scratch, n); +} diff --git a/gmp-6.3.0/mpn/generic/sec_div.c b/gmp-6.3.0/mpn/generic/sec_div.c new file mode 100644 index 0000000..1f08649 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_div.c @@ -0,0 +1,131 @@ +/* mpn_sec_div_qr, mpn_sec_div_r -- Compute Q = floor(U / V), U = U mod V. + Side-channel silent under the assumption that the used instructions are + side-channel silent. + + Contributed to the GNU project by Torbjörn Granlund. + +Copyright 2011-2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +#if OPERATION_sec_div_qr +#define FNAME mpn_sec_div_qr +#define FNAME_itch mpn_sec_div_qr_itch +#define Q(q) q, +#define RETTYPE mp_limb_t +#endif +#if OPERATION_sec_div_r +#define FNAME mpn_sec_div_r +#define FNAME_itch mpn_sec_div_r_itch +#define Q(q) +#define RETTYPE void +#endif + +mp_size_t +FNAME_itch (mp_size_t nn, mp_size_t dn) +{ +#if OPERATION_sec_div_qr +/* Needs (nn + dn + 1) + mpn_sec_pi1_div_qr's needs of (2nn' - dn + 1) for a + total of 3nn + 4 limbs at tp. Note that mpn_sec_pi1_div_qr's nn is one + greater than ours, therefore +4 and not just +2. 
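+   Explicitly, with nn' = nn + 1:
+   (nn + dn + 1) + (2*(nn + 1) - dn + 1) = 3*nn + 4.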
*/ + return 3 * nn + 4; +#endif +#if OPERATION_sec_div_r +/* Needs (nn + dn + 1) + mpn_sec_pi1_div_r's needs of (dn + 1) for a total of + nn + 2dn + 2 limbs at tp. */ + return nn + 2 * dn + 2; +#endif +} + +RETTYPE +FNAME (Q(mp_ptr qp) + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_ptr tp) +{ + mp_limb_t d1, d0; + unsigned int cnt; + mp_limb_t inv32; + + ASSERT (dn >= 1); + ASSERT (nn >= dn); + ASSERT (dp[dn - 1] != 0); + + d1 = dp[dn - 1]; + count_leading_zeros (cnt, d1); + + if (cnt != 0) + { + mp_limb_t qh, cy; + mp_ptr np2, dp2; + dp2 = tp; /* dn limbs */ + mpn_lshift (dp2, dp, dn, cnt); + + np2 = tp + dn; /* (nn + 1) limbs */ + cy = mpn_lshift (np2, np, nn, cnt); + np2[nn++] = cy; + + d0 = dp2[dn - 1]; + d0 += (~d0 != 0); + invert_limb (inv32, d0); + + /* We add nn + dn to tp here, not nn + 1 + dn, as expected. This is + since nn here will have been incremented. */ +#if OPERATION_sec_div_qr + qh = mpn_sec_pi1_div_qr (np2 + dn, np2, nn, dp2, dn, inv32, tp + nn + dn); + ASSERT (qh == 0); /* FIXME: this indicates inefficiency! */ + MPN_COPY (qp, np2 + dn, nn - dn - 1); + qh = np2[nn - 1]; +#else + mpn_sec_pi1_div_r (np2, nn, dp2, dn, inv32, tp + nn + dn); +#endif + + mpn_rshift (np, np2, dn, cnt); + +#if OPERATION_sec_div_qr + return qh; +#endif + } + else + { + /* FIXME: Consider copying np => np2 here, adding a 0-limb at the top. + That would simplify the underlying pi1 function, since then it could + assume nn > dn. */ + d0 = dp[dn - 1]; + d0 += (~d0 != 0); + invert_limb (inv32, d0); + +#if OPERATION_sec_div_qr + return mpn_sec_pi1_div_qr (qp, np, nn, dp, dn, inv32, tp); +#else + mpn_sec_pi1_div_r (np, nn, dp, dn, inv32, tp); +#endif + } +} diff --git a/gmp-6.3.0/mpn/generic/sec_invert.c b/gmp-6.3.0/mpn/generic/sec_invert.c new file mode 100644 index 0000000..07665d1 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_invert.c @@ -0,0 +1,177 @@ +/* mpn_sec_invert + + Contributed to the GNU project by Niels Möller + +Copyright 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if 0 +/* Currently unused. Should be resurrected once mpn_cnd_neg is + advertised. 
*/ +static mp_size_t +mpn_cnd_neg_itch (mp_size_t n) +{ + return n; +} +#endif + +/* FIXME: Ought to return carry */ +static void +mpn_cnd_neg (int cnd, mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, + mp_ptr scratch) +{ + mpn_lshift (scratch, ap, n, 1); + mpn_cnd_sub_n (cnd, rp, ap, scratch, n); +} + +static int +mpn_sec_eq_ui (mp_srcptr ap, mp_size_t n, mp_limb_t b) +{ + mp_limb_t d; + ASSERT (n > 0); + + d = ap[0] ^ b; + + while (--n > 0) + d |= ap[n]; + + return d == 0; +} + +mp_size_t +mpn_sec_invert_itch (mp_size_t n) +{ + return 4*n; +} + +/* Compute V <-- A^{-1} (mod M), in data-independent time. M must be + odd. Returns 1 on success, and 0 on failure (i.e., if gcd (A, m) != + 1). Inputs and outputs of size n, and no overlap allowed. The {ap, + n} area is destroyed. For arbitrary inputs, bit_size should be + 2*n*GMP_NUMB_BITS, but if A or M are known to be smaller, e.g., if + M = 2^521 - 1 and A < M, bit_size can be any bound on the sum of + the bit sizes of A and M. */ +int +mpn_sec_invert (mp_ptr vp, mp_ptr ap, mp_srcptr mp, + mp_size_t n, mp_bitcnt_t bit_size, + mp_ptr scratch) +{ + ASSERT (n > 0); + ASSERT (bit_size > 0); + ASSERT (mp[0] & 1); + ASSERT (! MPN_OVERLAP_P (ap, n, vp, n)); +#define bp (scratch + n) +#define up (scratch + 2*n) +#define m1hp (scratch + 3*n) + + /* Maintain + + a = u * orig_a (mod m) + b = v * orig_a (mod m) + + and b odd at all times. Initially, + + a = a_orig, u = 1 + b = m, v = 0 + */ + + + up[0] = 1; + mpn_zero (up+1, n - 1); + mpn_copyi (bp, mp, n); + mpn_zero (vp, n); + + ASSERT_CARRY (mpn_rshift (m1hp, mp, n, 1)); + ASSERT_NOCARRY (mpn_sec_add_1 (m1hp, m1hp, n, 1, scratch)); + + while (bit_size-- > 0) + { + mp_limb_t odd, swap, cy; + + /* Always maintain b odd. The logic of the iteration is as + follows. For a, b: + + odd = a & 1 + a -= odd * b + if (underflow from a-b) + { + b += a, assigns old a + a = B^n-a + } + + a /= 2 + + For u, v: + + if (underflow from a - b) + swap u, v + u -= odd * v + if (underflow from u - v) + u += m + + u /= 2 + if (a one bit was shifted out) + u += (m+1)/2 + + As long as a > 0, the quantity + + (bitsize of a) + (bitsize of b) + + is reduced by at least one bit per iteration, hence after (bit_size of + orig_a) + (bit_size of m) - 1 iterations we surely have a = 0. Then b + = gcd(orig_a, m) and if b = 1 then also v = orig_a^{-1} (mod m). + */ + + ASSERT (bp[0] & 1); + odd = ap[0] & 1; + + swap = mpn_cnd_sub_n (odd, ap, ap, bp, n); + mpn_cnd_add_n (swap, bp, bp, ap, n); + mpn_cnd_neg (swap, ap, ap, n, scratch); + + mpn_cnd_swap (swap, up, vp, n); + cy = mpn_cnd_sub_n (odd, up, up, vp, n); + cy -= mpn_cnd_add_n (cy, up, up, mp, n); + ASSERT (cy == 0); + + cy = mpn_rshift (ap, ap, n, 1); + ASSERT (cy == 0); + cy = mpn_rshift (up, up, n, 1); + cy = mpn_cnd_add_n (cy, up, up, m1hp, n); + ASSERT (cy == 0); + } + /* Should be all zeros, but check only extreme limbs */ + ASSERT ( (ap[0] | ap[n-1]) == 0); + /* Check if indeed gcd == 1. */ + return mpn_sec_eq_ui (bp, n, 1); +#undef bp +#undef up +#undef m1hp +} diff --git a/gmp-6.3.0/mpn/generic/sec_mul.c b/gmp-6.3.0/mpn/generic/sec_mul.c new file mode 100644 index 0000000..4bbfa61 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_mul.c @@ -0,0 +1,48 @@ +/* mpn_sec_mul. + + Contributed to the GNU project by Torbjörn Granlund. + +Copyright 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +void +mpn_sec_mul (mp_ptr rp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr tp) +{ + mpn_mul_basecase (rp, ap, an, bp, bn); +} + +mp_size_t +mpn_sec_mul_itch (mp_size_t an, mp_size_t bn) +{ + return 0; +} diff --git a/gmp-6.3.0/mpn/generic/sec_pi1_div.c b/gmp-6.3.0/mpn/generic/sec_pi1_div.c new file mode 100644 index 0000000..29d01e7 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_pi1_div.c @@ -0,0 +1,172 @@ +/* mpn_sec_pi1_div_qr, mpn_sec_pi1_div_r -- Compute Q = floor(U / V), U = U + mod V. Side-channel silent under the assumption that the used instructions + are side-channel silent. + + Contributed to the GNU project by Torbjörn Granlund. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011-2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +/* This side-channel silent division algorithm reduces the partial remainder by + GMP_NUMB_BITS/2 bits at a time, compared to GMP_NUMB_BITS for the main + division algorithm. We actually do not insist on reducing by exactly + GMP_NUMB_BITS/2, but may leave a partial remainder that is D*B^i to 3D*B^i + too large (B is the limb base, D is the divisor, and i is the induction + variable); the subsequent step will handle the extra partial remainder bits. + + With that partial remainder reduction, each step generates a quotient "half + limb". 
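+   (With GMP_NUMB_BITS = 64, for instance, each such step produces 32
+   quotient bits.)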
The outer loop generates two quotient half limbs, an upper (q1h) and + a lower (q0h) which are stored sparsely in separate limb arrays. These + arrays are added at the end; using separate arrays avoids data-dependent + carry propagation which could else pose a side-channel leakage problem. + + The quotient half limbs may be between -3 to 0 from the accurate value + ("accurate" being the one which corresponds to a reduction to a principal + partial remainder). Too small quotient half limbs correspond to too large + remainders, which we reduce later, as described above. + + In order to keep quotients from getting too big, corresponding to a negative + partial remainder, we use an inverse which is slightly smaller than usually. +*/ + +#if OPERATION_sec_pi1_div_qr +/* Needs (dn + 1) + (nn - dn) + (nn - dn) = 2nn - dn + 1 limbs at tp. */ +#define FNAME mpn_sec_pi1_div_qr +#define Q(q) q, +#define RETTYPE mp_limb_t +#endif +#if OPERATION_sec_pi1_div_r +/* Needs (dn + 1) limbs at tp. */ +#define FNAME mpn_sec_pi1_div_r +#define Q(q) +#define RETTYPE void +#endif + +RETTYPE +FNAME (Q(mp_ptr qp) + mp_ptr np, mp_size_t nn, + mp_srcptr dp, mp_size_t dn, + mp_limb_t dinv, + mp_ptr tp) +{ + mp_limb_t nh, cy, q1h, q0h, dummy, cnd; + mp_size_t i; + mp_ptr hp; +#if OPERATION_sec_pi1_div_qr + mp_limb_t qh; + mp_ptr qlp, qhp; +#endif + + ASSERT (dn >= 1); + ASSERT (nn >= dn); + ASSERT ((dp[dn - 1] & GMP_NUMB_HIGHBIT) != 0); + + if (nn == dn) + { + cy = mpn_sub_n (np, np, dp, dn); + mpn_cnd_add_n (cy, np, np, dp, dn); +#if OPERATION_sec_pi1_div_qr + return 1 - cy; +#else + return; +#endif + } + + /* Create a divisor copy shifted half a limb. */ + hp = tp; /* (dn + 1) limbs */ + hp[dn] = mpn_lshift (hp, dp, dn, GMP_NUMB_BITS / 2); + +#if OPERATION_sec_pi1_div_qr + qlp = tp + (dn + 1); /* (nn - dn) limbs */ + qhp = tp + (nn + 1); /* (nn - dn) limbs */ +#endif + + np += nn - dn; + nh = 0; + + for (i = nn - dn - 1; i >= 0; i--) + { + np--; + + nh = (nh << GMP_NUMB_BITS/2) + (np[dn] >> GMP_NUMB_BITS/2); + umul_ppmm (q1h, dummy, nh, dinv); + q1h += nh; +#if OPERATION_sec_pi1_div_qr + qhp[i] = q1h; +#endif + mpn_submul_1 (np, hp, dn + 1, q1h); + + nh = np[dn]; + umul_ppmm (q0h, dummy, nh, dinv); + q0h += nh; +#if OPERATION_sec_pi1_div_qr + qlp[i] = q0h; +#endif + nh -= mpn_submul_1 (np, dp, dn, q0h); + } + + /* 1st adjustment depends on extra high remainder limb. */ + cnd = nh != 0; /* FIXME: cmp-to-int */ +#if OPERATION_sec_pi1_div_qr + qlp[0] += cnd; +#endif + nh -= mpn_cnd_sub_n (cnd, np, np, dp, dn); + + /* 2nd adjustment depends on remainder/divisor comparison as well as whether + extra remainder limb was nullified by previous subtract. */ + cy = mpn_sub_n (np, np, dp, dn); + cy = cy - nh; +#if OPERATION_sec_pi1_div_qr + qlp[0] += 1 - cy; +#endif + mpn_cnd_add_n (cy, np, np, dp, dn); + + /* 3rd adjustment depends on remainder/divisor comparison. */ + cy = mpn_sub_n (np, np, dp, dn); +#if OPERATION_sec_pi1_div_qr + qlp[0] += 1 - cy; +#endif + mpn_cnd_add_n (cy, np, np, dp, dn); + +#if OPERATION_sec_pi1_div_qr + /* Combine quotient halves into final quotient. */ + qh = mpn_lshift (qhp, qhp, nn - dn, GMP_NUMB_BITS/2); + qh += mpn_add_n (qp, qhp, qlp, nn - dn); + + return qh; +#else + return; +#endif +} diff --git a/gmp-6.3.0/mpn/generic/sec_powm.c b/gmp-6.3.0/mpn/generic/sec_powm.c new file mode 100644 index 0000000..bba11cf --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_powm.c @@ -0,0 +1,430 @@ +/* mpn_sec_powm -- Compute R = U^E mod M. 
Secure variant, side-channel silent + under the assumption that the multiply instruction is side channel silent. + + Contributed to the GNU project by Torbjörn Granlund. + +Copyright 2007-2009, 2011-2014, 2018-2019, 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* + BASIC ALGORITHM, Compute U^E mod M, where M < B^n is odd. + + 1. T <- (B^n * U) mod M; convert to REDC form + + 2. Compute table U^0, U^1, U^2... of floor(log(E))-dependent size + + 3. While there are more bits in E + W <- power left-to-right base-k + + The article "Defeating modexp side-channel attacks with data-independent + execution traces", https://gmplib.org/~tege/modexp-silent.pdf, has details. + + + TODO: + + * Make getbits a macro, thereby allowing it to update the index operand. + That will simplify the code using getbits. (Perhaps make getbits' sibling + getbit then have similar form, for symmetry.) + + * Choose window size without looping. (Superoptimize or think(tm).) + + * REDC_1_TO_REDC_2_THRESHOLD might actually represent the cutoff between + redc_1 and redc_n. On such systems, we will switch to redc_2 causing + slowdown. +*/ + +#include "gmp-impl.h" +#include "longlong.h" + +#undef MPN_REDC_1_SEC +#if HAVE_NATIVE_mpn_sbpi1_bdiv_r +#define MPN_REDC_1_SEC(rp, up, mp, n, invm) \ + do { \ + mp_limb_t cy; \ + cy = mpn_sbpi1_bdiv_r (up, 2 * n, mp, n, invm); \ + mpn_cnd_sub_n (cy, rp, up + n, mp, n); \ + } while (0) +#else +#define MPN_REDC_1_SEC(rp, up, mp, n, invm) \ + do { \ + mp_limb_t cy; \ + cy = mpn_redc_1 (rp, up, mp, n, invm); \ + mpn_cnd_sub_n (cy, rp, rp, mp, n); \ + } while (0) +#endif + +#if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2 +#undef MPN_REDC_2_SEC +#define MPN_REDC_2_SEC(rp, up, mp, n, mip) \ + do { \ + mp_limb_t cy; \ + cy = mpn_redc_2 (rp, up, mp, n, mip); \ + mpn_cnd_sub_n (cy, rp, rp, mp, n); \ + } while (0) +#else +#define MPN_REDC_2_SEC(rp, up, mp, n, mip) /* empty */ +#undef REDC_1_TO_REDC_2_THRESHOLD +#define REDC_1_TO_REDC_2_THRESHOLD MP_SIZE_T_MAX +#endif + +/* Define our own mpn squaring function. We do this since we cannot use a + native mpn_sqr_basecase over TUNE_SQR_TOOM2_MAX, or a non-native one over + SQR_TOOM2_THRESHOLD. This is so because of fixed size stack allocations + made inside mpn_sqr_basecase. */ + +#if ! HAVE_NATIVE_mpn_sqr_basecase +/* The limit of the generic code is SQR_TOOM2_THRESHOLD. */ +#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD +#endif + +#if HAVE_NATIVE_mpn_sqr_basecase +#ifdef TUNE_SQR_TOOM2_MAX +/* We slightly abuse TUNE_SQR_TOOM2_MAX here. 
If it is set for an assembly + mpn_sqr_basecase, it comes from SQR_TOOM2_THRESHOLD_MAX in the assembly + file. An assembly mpn_sqr_basecase that does not define it should allow + any size. */ +#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD +#endif +#endif + +#ifdef WANT_FAT_BINARY +/* For fat builds, we use SQR_TOOM2_THRESHOLD which will expand to a read from + __gmpn_cpuvec. Perhaps any possible sqr_basecase.asm allow any size, and we + limit the use unnecessarily. We cannot tell, so play it safe. FIXME. */ +#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD +#endif + +#ifndef SQR_BASECASE_LIM +/* If SQR_BASECASE_LIM is now not defined, use mpn_sqr_basecase for any operand + size. */ +#define SQR_BASECASE_LIM MP_SIZE_T_MAX +#endif + +#define mpn_local_sqr(rp,up,n) \ + do { \ + if (ABOVE_THRESHOLD (n, SQR_BASECASE_THRESHOLD) \ + && BELOW_THRESHOLD (n, SQR_BASECASE_LIM)) \ + mpn_sqr_basecase (rp, up, n); \ + else \ + mpn_mul_basecase(rp, up, n, up, n); \ + } while (0) + +#define getbit(p,bi) \ + ((p[(bi - 1) / GMP_NUMB_BITS] >> (bi - 1) % GMP_NUMB_BITS) & 1) + +/* FIXME: Maybe some things would get simpler if all callers ensure + that bi >= nbits. As far as I understand, with the current code bi + < nbits can happen only for the final iteration. */ +static inline mp_limb_t +getbits (const mp_limb_t *p, mp_bitcnt_t bi, int nbits) +{ + int nbits_in_r; + mp_limb_t r; + mp_size_t i; + + if (bi < nbits) + { + return p[0] & (((mp_limb_t) 1 << bi) - 1); + } + else + { + bi -= nbits; /* bit index of low bit to extract */ + i = bi / GMP_NUMB_BITS; /* word index of low bit to extract */ + bi %= GMP_NUMB_BITS; /* bit index in low word */ + r = p[i] >> bi; /* extract (low) bits */ + nbits_in_r = GMP_NUMB_BITS - bi; /* number of bits now in r */ + if (nbits_in_r < nbits) /* did we get enough bits? */ + r += p[i + 1] << nbits_in_r; /* prepend bits from higher word */ + return r & (((mp_limb_t ) 1 << nbits) - 1); + } +} + +#ifndef POWM_SEC_TABLE +#if GMP_NUMB_BITS < 50 +#define POWM_SEC_TABLE 2,33,96,780,2741 +#else +#define POWM_SEC_TABLE 2,130,524,2578 +#endif +#endif + +#if TUNE_PROGRAM_BUILD +extern int win_size (mp_bitcnt_t); +#else +static inline int +win_size (mp_bitcnt_t enb) +{ + int k; + /* Find k, such that x[k-1] < enb <= x[k]. + + We require that x[k] >= k, then it follows that enb > x[k-1] >= + k-1, which implies k <= enb. + */ + static const mp_bitcnt_t x[] = {POWM_SEC_TABLE,~(mp_bitcnt_t)0}; + for (k = 0; enb > x[k++]; ) + ; + ASSERT (k <= enb); + return k; +} +#endif + +/* Convert U to REDC form, U_r = B^n * U mod M. + Uses scratch space at tp of size 2un + n + 1. 
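+   For instance, with B = 2^64, n = 1, M = 7 and U = 3 the REDC form is
+   U_r = (2^64 * 3) mod 7 = 6.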
*/ +static void +redcify (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr mp, mp_size_t n, mp_ptr tp) +{ + MPN_ZERO (tp, n); + MPN_COPY (tp + n, up, un); + + mpn_sec_div_r (tp, un + n, mp, n, tp + un + n); + MPN_COPY (rp, tp, n); +} + +static mp_limb_t +sec_binvert_limb (mp_limb_t n) +{ + mp_limb_t inv, t; + ASSERT ((n & 1) == 1); + /* 3 + 2 -> 5 */ + inv = n + (((n + 1) << 1) & 0x18); + + t = n * inv; +#if GMP_NUMB_BITS <= 10 + /* 5 x 2 -> 10 */ + inv = 2 * inv - inv * t; +#else /* GMP_NUMB_BITS > 10 */ + /* 5 x 2 + 2 -> 12 */ + inv = 2 * inv - inv * t + ((inv<<10)&-(t&(1<<5))); +#endif /* GMP_NUMB_BITS <= 10 */ + + if (GMP_NUMB_BITS > 12) + { + t = n * inv - 1; + if (GMP_NUMB_BITS <= 36) + { + /* 12 x 3 -> 36 */ + inv += inv * t * (t - 1); + } + else /* GMP_NUMB_BITS > 36 */ + { + mp_limb_t t2 = t * t; +#if GMP_NUMB_BITS <= 60 + /* 12 x 5 -> 60 */ + inv += inv * (t2 + 1) * (t2 - t); +#else /* GMP_NUMB_BITS > 60 */ + /* 12 x 5 + 4 -> 64 */ + inv *= (t2 + 1) * (t2 - t) + 1 - ((t<<48)&-(t&(1<<12))); + + /* 64 -> 128 -> 256 -> ... */ + for (int todo = (GMP_NUMB_BITS - 1) >> 6; todo != 0; todo >>= 1) + inv = 2 * inv - inv * inv * n; +#endif /* GMP_NUMB_BITS <= 60 */ + } + } + + ASSERT ((inv * n & GMP_NUMB_MASK) == 1); + return inv & GMP_NUMB_MASK; +} + +/* {rp, n} <-- {bp, bn} ^ {ep, en} mod {mp, n}, + where en = ceil (enb / GMP_NUMB_BITS) + Requires that {mp, n} is odd (and hence also mp[0] odd). + Uses scratch space at tp as defined by mpn_sec_powm_itch. */ +void +mpn_sec_powm (mp_ptr rp, mp_srcptr bp, mp_size_t bn, + mp_srcptr ep, mp_bitcnt_t enb, + mp_srcptr mp, mp_size_t n, mp_ptr tp) +{ + mp_limb_t ip[2], *mip; + int windowsize, this_windowsize; + mp_limb_t expbits; + mp_ptr pp, this_pp, ps; + long i; + int cnd; + + ASSERT (enb > 0); + ASSERT (n > 0); + /* The code works for bn = 0, but the defined scratch space is 2 limbs + greater than we supply, when converting 1 to redc form . */ + ASSERT (bn > 0); + ASSERT ((mp[0] & 1) != 0); + + windowsize = win_size (enb); + + mip = ip; + mip[0] = sec_binvert_limb (mp[0]); + if (ABOVE_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { + mp_limb_t t, dummy, mip0 = mip[0]; + + umul_ppmm (t, dummy, mip0, mp[0]); + ASSERT (dummy == 1); + t += mip0 * mp[1]; /* t = (mp * mip0)[1] */ + + mip[1] = t * mip0 - 1; /* ~( - t * mip0) */ + } + mip[0] = -mip[0]; + + pp = tp; + tp += (n << windowsize); /* put tp after power table */ + + /* Compute pp[0] table entry */ + /* scratch: | n | 1 | n+2 | */ + /* | pp[0] | 1 | redcify | */ + this_pp = pp; + this_pp[n] = 1; + redcify (this_pp, this_pp + n, 1, mp, n, this_pp + n + 1); + this_pp += n; + + /* Compute pp[1] table entry. To avoid excessive scratch usage in the + degenerate situation where B >> M, we let redcify use scratch space which + will later be used by the pp table (element 2 and up). */ + /* scratch: | n | n | bn + n + 1 | */ + /* | pp[0] | pp[1] | redcify | */ + redcify (this_pp, bp, bn, mp, n, this_pp + n); + + /* Precompute powers of b and put them in the temporary area at pp. */ + /* scratch: | n | n | ... | | 2n | */ + /* | pp[0] | pp[1] | ... 
| pp[2^windowsize-1] | product | */ + ps = pp + n; /* initially B^1 */ + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { + for (i = (1 << windowsize) - 2; i > 0; i -= 2) + { + mpn_local_sqr (tp, ps, n); + ps += n; + this_pp += n; + MPN_REDC_1_SEC (this_pp, tp, mp, n, mip[0]); + + mpn_mul_basecase (tp, this_pp, n, pp + n, n); + this_pp += n; + MPN_REDC_1_SEC (this_pp, tp, mp, n, mip[0]); + } + } + else + { + for (i = (1 << windowsize) - 2; i > 0; i -= 2) + { + mpn_local_sqr (tp, ps, n); + ps += n; + this_pp += n; + MPN_REDC_2_SEC (this_pp, tp, mp, n, mip); + + mpn_mul_basecase (tp, this_pp, n, pp + n, n); + this_pp += n; + MPN_REDC_2_SEC (this_pp, tp, mp, n, mip); + } + } + + expbits = getbits (ep, enb, windowsize); + ASSERT_ALWAYS (enb >= windowsize); + enb -= windowsize; + + mpn_sec_tabselect (rp, pp, n, 1 << windowsize, expbits); + + /* Main exponentiation loop. */ + /* scratch: | n | n | ... | | 3n-4n | */ + /* | pp[0] | pp[1] | ... | pp[2^windowsize-1] | loop scratch | */ + +#define INNERLOOP \ + while (enb != 0) \ + { \ + expbits = getbits (ep, enb, windowsize); \ + this_windowsize = windowsize; \ + if (enb < windowsize) \ + { \ + this_windowsize -= windowsize - enb; \ + enb = 0; \ + } \ + else \ + enb -= windowsize; \ + \ + do \ + { \ + mpn_local_sqr (tp, rp, n); \ + MPN_REDUCE (rp, tp, mp, n, mip); \ + this_windowsize--; \ + } \ + while (this_windowsize != 0); \ + \ + mpn_sec_tabselect (tp + 2*n, pp, n, 1 << windowsize, expbits); \ + mpn_mul_basecase (tp, rp, n, tp + 2*n, n); \ + \ + MPN_REDUCE (rp, tp, mp, n, mip); \ + } + + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + { +#undef MPN_REDUCE +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_1_SEC (rp, tp, mp, n, mip[0]) + INNERLOOP; + } + else + { +#undef MPN_REDUCE +#define MPN_REDUCE(rp,tp,mp,n,mip) MPN_REDC_2_SEC (rp, tp, mp, n, mip) + INNERLOOP; + } + + MPN_COPY (tp, rp, n); + MPN_ZERO (tp + n, n); + + if (BELOW_THRESHOLD (n, REDC_1_TO_REDC_2_THRESHOLD)) + MPN_REDC_1_SEC (rp, tp, mp, n, mip[0]); + else + MPN_REDC_2_SEC (rp, tp, mp, n, mip); + + cnd = mpn_sub_n (tp, rp, mp, n); /* we need just retval */ + mpn_cnd_sub_n (!cnd, rp, rp, mp, n); +} + +mp_size_t +mpn_sec_powm_itch (mp_size_t bn, mp_bitcnt_t enb, mp_size_t n) +{ + int windowsize; + mp_size_t redcify_itch, itch; + + /* FIXME: no more _local/_basecase difference. */ + /* The top scratch usage will either be when reducing B in the 2nd redcify + call, or more typically n*2^windowsize + 3n or 4n, in the main loop. (It + is 3n or 4n depending on if we use mpn_local_sqr or a native + mpn_sqr_basecase. We assume 4n always for now.) */ + + windowsize = win_size (enb); + + /* The 2n term is due to pp[0] and pp[1] at the time of the 2nd redcify call, + the (bn + n) term is due to redcify's own usage, and the rest is due to + mpn_sec_div_r's usage when called from redcify. */ + redcify_itch = (2 * n) + (bn + n) + ((bn + n) + 2 * n + 2); + + /* The n * 2^windowsize term is due to the power table, the 4n term is due to + scratch needs of squaring/multiplication in the exponentiation loop. */ + itch = (n << windowsize) + (4 * n); + + return MAX (itch, redcify_itch); +} diff --git a/gmp-6.3.0/mpn/generic/sec_sqr.c b/gmp-6.3.0/mpn/generic/sec_sqr.c new file mode 100644 index 0000000..83fc7d9 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_sqr.c @@ -0,0 +1,76 @@ +/* mpn_sec_sqr. + + Contributed to the GNU project by Torbjörn Granlund. + +Copyright 2013, 2014 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if ! HAVE_NATIVE_mpn_sqr_basecase +/* The limit of the generic code is SQR_TOOM2_THRESHOLD. */ +#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD +#endif + +#if HAVE_NATIVE_mpn_sqr_basecase +#ifdef TUNE_SQR_TOOM2_MAX +/* We slightly abuse TUNE_SQR_TOOM2_MAX here. If it is set for an assembly + mpn_sqr_basecase, it comes from SQR_TOOM2_THRESHOLD_MAX in the assembly + file. An assembly mpn_sqr_basecase that does not define it should allow + any size. */ +#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD +#endif +#endif + +#ifdef WANT_FAT_BINARY +/* For fat builds, we use SQR_TOOM2_THRESHOLD which will expand to a read from + __gmpn_cpuvec. Perhaps any possible sqr_basecase.asm allow any size, and we + limit the use unnecessarily. We cannot tell, so play it safe. FIXME. */ +#define SQR_BASECASE_LIM SQR_TOOM2_THRESHOLD +#endif + +void +mpn_sec_sqr (mp_ptr rp, + mp_srcptr ap, mp_size_t an, + mp_ptr tp) +{ +#ifndef SQR_BASECASE_LIM +/* If SQR_BASECASE_LIM is now not defined, use mpn_sqr_basecase for any operand + size. */ + mpn_sqr_basecase (rp, ap, an); +#else +/* Else use mpn_mul_basecase. */ + mpn_mul_basecase (rp, ap, an, ap, an); +#endif +} + +mp_size_t +mpn_sec_sqr_itch (mp_size_t an) +{ + return 0; +} diff --git a/gmp-6.3.0/mpn/generic/sec_tabselect.c b/gmp-6.3.0/mpn/generic/sec_tabselect.c new file mode 100644 index 0000000..f50bdac --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sec_tabselect.c @@ -0,0 +1,134 @@ +/* mpn_sec_tabselect. + +Copyright 2007-2009, 2011, 2013, 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include +#include "gmp-impl.h" + +#ifndef SEC_TABSELECT_METHOD +#define SEC_TABSELECT_METHOD 1 +#endif + +/* Select entry `which' from table `tab', which has nents entries, each `n' + limbs. Store the selected entry at rp. Reads entire table to avoid + side-channel information leaks. O(n*nents). */ + +#if SEC_TABSELECT_METHOD == 1 +void +mpn_sec_tabselect (volatile mp_limb_t *rp, volatile const mp_limb_t *tab, + mp_size_t n, mp_size_t nents, mp_size_t which) +{ + mp_size_t k, i; + mp_limb_t mask; + volatile const mp_limb_t *tp; + + tp = tab; + + /* Place first entry into result area. */ + for (i = 0; i < n; i++) + rp[i] = tp[i]; + + /* Conditionally replace entry in result area by entry 1...(nents-1) using + masking trickery. */ + for (k = 1; k < nents; k++) + { + /* Generate a mask using an expression which all compilers should compile + into branch-free code. The convoluted expression is designed to both + allow mp_limb_t greater and mp_limb_t smaller than mp_size_t. */ + mask = -(mp_limb_t) ((-(unsigned long) (which ^ k)) >> (BITS_PER_ULONG - 1)); + tp += n; + for (i = 0; i < n; i++) + rp[i] = (rp[i] & mask) | (tp[i] & ~mask); + } +} +#endif + +#if SEC_TABSELECT_METHOD == 2 +void +mpn_sec_tabselect (volatile mp_limb_t * restrict rp, + volatile const mp_limb_t * restrict tab, + mp_size_t n, mp_size_t nents, mp_size_t which) +{ + mp_size_t k, i; + mp_limb_t mask, r0, r1, r2, r3; + volatile const mp_limb_t * restrict tp; + + if (n & 1) + { + tp = tab; + r0 = 0; + for (k = 0; k < nents; k++) + { + mask = (mp_limb_t) ((-(unsigned long) (which ^ k)) >> (BITS_PER_ULONG - 1)) - 1; + r0 += tp[0] & mask; + tp += n; + } + rp[0] = r0; + rp += 1; + tab += 1; + } + + if (n & 2) + { + tp = tab; + r0 = r1 = 0; + for (k = 0; k < nents; k++) + { + mask = (mp_limb_t) ((-(unsigned long) (which ^ k)) >> (BITS_PER_ULONG - 1)) - 1; + r0 += tp[0] & mask; + r1 += tp[1] & mask; + tp += n; + } + rp[0] = r0; + rp[1] = r1; + rp += 2; + tab += 2; + } + + for (i = 0; i <= n - 4; i += 4) + { + tp = tab + i; + r0 = r1 = r2 = r3 = 0; + for (k = 0; k < nents; k++) + { + mask = (mp_limb_t) ((-(unsigned long) (which ^ k)) >> (BITS_PER_ULONG - 1)) - 1; + r0 += tp[0] & mask; + r1 += tp[1] & mask; + r2 += tp[2] & mask; + r3 += tp[3] & mask; + tp += n; + } + rp[0] = r0; + rp[1] = r1; + rp[2] = r2; + rp[3] = r3; + rp += 4; + } +} +#endif diff --git a/gmp-6.3.0/mpn/generic/set_str.c b/gmp-6.3.0/mpn/generic/set_str.c new file mode 100644 index 0000000..2bd584c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/set_str.c @@ -0,0 +1,290 @@ +/* mpn_set_str (mp_ptr res_ptr, const char *str, size_t str_len, int base) -- + Convert a STR_LEN long base BASE byte string pointed to by STR to a limb + vector pointed to by RES_PTR. Return the number of limbs in RES_PTR. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTIONS IN THIS FILE, EXCEPT mpn_set_str, ARE INTERNAL WITH MUTABLE + INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. + IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A + FUTURE GNU MP RELEASE. + +Copyright 1991-2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* TODO: + + Perhaps do not compute the highest power? + Instead, multiply twice by the 2nd highest power: + + _______ + |_______| hp + |_______| pow + _______________ + |_______________| final result + + + _______ + |_______| hp + |___| pow[-1] + ___________ + |___________| intermediate result + |___| pow[-1] + _______________ + |_______________| final result + + Generalizing that idea, perhaps we should make powtab contain successive + cubes, not squares. +*/ + +#include "gmp-impl.h" + +mp_size_t +mpn_set_str (mp_ptr rp, const unsigned char *str, size_t str_len, int base) +{ + if (POW2_P (base)) + { + /* The base is a power of 2. Read the input string from least to most + significant character/digit. */ + + const unsigned char *s; + int next_bitpos; + mp_limb_t res_digit; + mp_size_t size; + int bits_per_indigit = mp_bases[base].big_base; + + size = 0; + res_digit = 0; + next_bitpos = 0; + + for (s = str + str_len - 1; s >= str; s--) + { + int inp_digit = *s; + + res_digit |= ((mp_limb_t) inp_digit << next_bitpos) & GMP_NUMB_MASK; + next_bitpos += bits_per_indigit; + if (next_bitpos >= GMP_NUMB_BITS) + { + rp[size++] = res_digit; + next_bitpos -= GMP_NUMB_BITS; + res_digit = inp_digit >> (bits_per_indigit - next_bitpos); + } + } + + if (res_digit != 0) + rp[size++] = res_digit; + return size; + } + + if (BELOW_THRESHOLD (str_len, SET_STR_PRECOMPUTE_THRESHOLD)) + return mpn_bc_set_str (rp, str, str_len, base); + else + { + mp_ptr powtab_mem, tp; + powers_t powtab[GMP_LIMB_BITS]; + int chars_per_limb; + mp_size_t size; + mp_size_t un; + TMP_DECL; + + TMP_MARK; + + chars_per_limb = mp_bases[base].chars_per_limb; + + un = str_len / chars_per_limb + 1; /* FIXME: scalar integer division */ + + /* Allocate one large block for the powers of big_base. */ + powtab_mem = TMP_BALLOC_LIMBS (mpn_str_powtab_alloc (un)); + + size_t n_pows = mpn_compute_powtab (powtab, powtab_mem, un, base); + powers_t *pt = powtab + n_pows; + + tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un)); + size = mpn_dc_set_str (rp, str, str_len, pt, tp); + + TMP_FREE; + return size; + } +} + +mp_size_t +mpn_dc_set_str (mp_ptr rp, const unsigned char *str, size_t str_len, + const powers_t *powtab, mp_ptr tp) +{ + size_t len_lo, len_hi; + mp_limb_t cy; + mp_size_t ln, hn, n, sn; + + len_lo = powtab->digits_in_base; + + if (str_len <= len_lo) + { + if (BELOW_THRESHOLD (str_len, SET_STR_DC_THRESHOLD)) + return mpn_bc_set_str (rp, str, str_len, powtab->base); + else + return mpn_dc_set_str (rp, str, str_len, powtab - 1, tp); + } + + len_hi = str_len - len_lo; + ASSERT (len_lo >= len_hi); + + if (BELOW_THRESHOLD (len_hi, SET_STR_DC_THRESHOLD)) + hn = mpn_bc_set_str (tp, str, len_hi, powtab->base); + else + hn = mpn_dc_set_str (tp, str, len_hi, powtab - 1, rp); + + sn = powtab->shift; + + if (hn == 0) + { + /* Zero +1 limb here, to avoid reading an allocated but uninitialised + limb in mpn_incr_u below. 
*/ + MPN_ZERO (rp, powtab->n + sn + 1); + } + else + { + if (powtab->n > hn) + mpn_mul (rp + sn, powtab->p, powtab->n, tp, hn); + else + mpn_mul (rp + sn, tp, hn, powtab->p, powtab->n); + MPN_ZERO (rp, sn); + } + + str = str + str_len - len_lo; + if (BELOW_THRESHOLD (len_lo, SET_STR_DC_THRESHOLD)) + ln = mpn_bc_set_str (tp, str, len_lo, powtab->base); + else + ln = mpn_dc_set_str (tp, str, len_lo, powtab - 1, tp + powtab->n + sn + 1); + + if (ln != 0) + { + cy = mpn_add_n (rp, rp, tp, ln); + mpn_incr_u (rp + ln, cy); + } + n = hn + powtab->n + sn; + return n - (rp[n - 1] == 0); +} + +mp_size_t +mpn_bc_set_str (mp_ptr rp, const unsigned char *str, size_t str_len, int base) +{ + mp_size_t size; + size_t i; + long j; + mp_limb_t cy_limb; + + mp_limb_t big_base; + int chars_per_limb; + mp_limb_t res_digit; + + ASSERT (base >= 2); + ASSERT (base < numberof (mp_bases)); + ASSERT (str_len >= 1); + + big_base = mp_bases[base].big_base; + chars_per_limb = mp_bases[base].chars_per_limb; + + size = 0; + for (i = chars_per_limb; i < str_len; i += chars_per_limb) + { + res_digit = *str++; + if (base == 10) + { /* This is a common case. + Help the compiler to avoid multiplication. */ + for (j = MP_BASES_CHARS_PER_LIMB_10 - 1; j != 0; j--) + res_digit = res_digit * 10 + *str++; + } + else + { + for (j = chars_per_limb - 1; j != 0; j--) + res_digit = res_digit * base + *str++; + } + + if (size == 0) + { + if (res_digit != 0) + { + rp[0] = res_digit; + size = 1; + } + } + else + { +#if HAVE_NATIVE_mpn_mul_1c + cy_limb = mpn_mul_1c (rp, rp, size, big_base, res_digit); +#else + cy_limb = mpn_mul_1 (rp, rp, size, big_base); + cy_limb += mpn_add_1 (rp, rp, size, res_digit); +#endif + if (cy_limb != 0) + rp[size++] = cy_limb; + } + } + + big_base = base; + res_digit = *str++; + if (base == 10) + { /* This is a common case. + Help the compiler to avoid multiplication. */ + for (j = str_len - (i - MP_BASES_CHARS_PER_LIMB_10) - 1; j > 0; j--) + { + res_digit = res_digit * 10 + *str++; + big_base *= 10; + } + } + else + { + for (j = str_len - (i - chars_per_limb) - 1; j > 0; j--) + { + res_digit = res_digit * base + *str++; + big_base *= base; + } + } + + if (size == 0) + { + if (res_digit != 0) + { + rp[0] = res_digit; + size = 1; + } + } + else + { +#if HAVE_NATIVE_mpn_mul_1c + cy_limb = mpn_mul_1c (rp, rp, size, big_base, res_digit); +#else + cy_limb = mpn_mul_1 (rp, rp, size, big_base); + cy_limb += mpn_add_1 (rp, rp, size, res_digit); +#endif + if (cy_limb != 0) + rp[size++] = cy_limb; + } + return size; +} diff --git a/gmp-6.3.0/mpn/generic/sizeinbase.c b/gmp-6.3.0/mpn/generic/sizeinbase.c new file mode 100644 index 0000000..faee947 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sizeinbase.c @@ -0,0 +1,49 @@ +/* mpn_sizeinbase -- approximation to chars required for an mpn. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. + +Copyright 1991, 1993-1995, 2001, 2002, 2011, 2012 Free Software Foundation, +Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. 
+ +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +/* Same as mpz_sizeinbase, meaning exact for power-of-2 bases, and either + exact or 1 too big for other bases. */ + +size_t +mpn_sizeinbase (mp_srcptr xp, mp_size_t xsize, int base) +{ + size_t result; + MPN_SIZEINBASE (result, xp, xsize, base); + return result; +} diff --git a/gmp-6.3.0/mpn/generic/sqr.c b/gmp-6.3.0/mpn/generic/sqr.c new file mode 100644 index 0000000..74fbff0 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sqr.c @@ -0,0 +1,98 @@ +/* mpn_sqr -- square natural numbers. + +Copyright 1991, 1993, 1994, 1996-2003, 2005, 2008, 2009 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +void +mpn_sqr (mp_ptr p, mp_srcptr a, mp_size_t n) +{ + ASSERT (n >= 1); + ASSERT (! MPN_OVERLAP_P (p, 2 * n, a, n)); + + if (BELOW_THRESHOLD (n, SQR_BASECASE_THRESHOLD)) + { /* mul_basecase is faster than sqr_basecase on small sizes sometimes */ + mpn_mul_basecase (p, a, n, a, n); + } + else if (BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) + { + mpn_sqr_basecase (p, a, n); + } + else if (BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) + { + /* Allocate workspace of fixed size on stack: fast! 
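+     The array below holds mpn_toom2_sqr_itch (SQR_TOOM3_THRESHOLD_LIMIT-1)
+     limbs, a compile-time constant large enough for any n in this range.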
*/ + mp_limb_t ws[mpn_toom2_sqr_itch (SQR_TOOM3_THRESHOLD_LIMIT-1)]; + ASSERT (SQR_TOOM3_THRESHOLD <= SQR_TOOM3_THRESHOLD_LIMIT); + mpn_toom2_sqr (p, a, n, ws); + } + else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD)) + { + mp_ptr ws; + TMP_SDECL; + TMP_SMARK; + ws = TMP_SALLOC_LIMBS (mpn_toom3_sqr_itch (n)); + mpn_toom3_sqr (p, a, n, ws); + TMP_SFREE; + } + else if (BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD)) + { + mp_ptr ws; + TMP_SDECL; + TMP_SMARK; + ws = TMP_SALLOC_LIMBS (mpn_toom4_sqr_itch (n)); + mpn_toom4_sqr (p, a, n, ws); + TMP_SFREE; + } + else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD)) + { + mp_ptr ws; + TMP_SDECL; + TMP_SMARK; + ws = TMP_SALLOC_LIMBS (mpn_toom6_sqr_itch (n)); + mpn_toom6_sqr (p, a, n, ws); + TMP_SFREE; + } + else if (BELOW_THRESHOLD (n, SQR_FFT_THRESHOLD)) + { + mp_ptr ws; + TMP_DECL; + TMP_MARK; + ws = TMP_ALLOC_LIMBS (mpn_toom8_sqr_itch (n)); + mpn_toom8_sqr (p, a, n, ws); + TMP_FREE; + } + else + { + /* The current FFT code allocates its own space. That should probably + change. */ + mpn_fft_mul (p, a, n, a, n); + } +} diff --git a/gmp-6.3.0/mpn/generic/sqr_basecase.c b/gmp-6.3.0/mpn/generic/sqr_basecase.c new file mode 100644 index 0000000..2645bad --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sqr_basecase.c @@ -0,0 +1,361 @@ +/* mpn_sqr_basecase -- Internal routine to square a natural number + of length n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright 1991-1994, 1996, 1997, 2000-2005, 2008, 2010, 2011, 2017 Free +Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + + +#if HAVE_NATIVE_mpn_sqr_diagonal +#define MPN_SQR_DIAGONAL(rp, up, n) \ + mpn_sqr_diagonal (rp, up, n) +#else +#define MPN_SQR_DIAGONAL(rp, up, n) \ + do { \ + mp_size_t _i; \ + for (_i = 0; _i < (n); _i++) \ + { \ + mp_limb_t ul, lpl; \ + ul = (up)[_i]; \ + umul_ppmm ((rp)[2 * _i + 1], lpl, ul, ul << GMP_NAIL_BITS); \ + (rp)[2 * _i] = lpl >> GMP_NAIL_BITS; \ + } \ + } while (0) +#endif + +#if HAVE_NATIVE_mpn_sqr_diag_addlsh1 +#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n) \ + mpn_sqr_diag_addlsh1 (rp, tp, up, n) +#else +#if HAVE_NATIVE_mpn_addlsh1_n +#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n) \ + do { \ + mp_limb_t cy; \ + MPN_SQR_DIAGONAL (rp, up, n); \ + cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2); \ + rp[2 * n - 1] += cy; \ + } while (0) +#else +#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n) \ + do { \ + mp_limb_t cy; \ + MPN_SQR_DIAGONAL (rp, up, n); \ + cy = mpn_lshift (tp, tp, 2 * n - 2, 1); \ + cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2); \ + rp[2 * n - 1] += cy; \ + } while (0) +#endif +#endif + + +#undef READY_WITH_mpn_sqr_basecase + + +#if ! defined (READY_WITH_mpn_sqr_basecase) && HAVE_NATIVE_mpn_addmul_2s +void +mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_size_t i; + mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD]; + mp_ptr tp = tarr; + mp_limb_t cy; + + /* must fit 2*n limbs in tarr */ + ASSERT (n <= SQR_TOOM2_THRESHOLD); + + if ((n & 1) != 0) + { + if (n == 1) + { + mp_limb_t ul, lpl; + ul = up[0]; + umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS); + rp[0] = lpl >> GMP_NAIL_BITS; + return; + } + + MPN_ZERO (tp, n); + + for (i = 0; i <= n - 2; i += 2) + { + cy = mpn_addmul_2s (tp + 2 * i, up + i + 1, n - (i + 1), up + i); + tp[n + i] = cy; + } + } + else + { + if (n == 2) + { +#if HAVE_NATIVE_mpn_mul_2 + rp[3] = mpn_mul_2 (rp, up, 2, up); +#else + rp[0] = 0; + rp[1] = 0; + rp[3] = mpn_addmul_2 (rp, up, 2, up); +#endif + return; + } + + MPN_ZERO (tp, n); + + for (i = 0; i <= n - 4; i += 2) + { + cy = mpn_addmul_2s (tp + 2 * i, up + i + 1, n - (i + 1), up + i); + tp[n + i] = cy; + } + cy = mpn_addmul_1 (tp + 2 * n - 4, up + n - 1, 1, up[n - 2]); + tp[2 * n - 3] = cy; + } + + MPN_SQR_DIAG_ADDLSH1 (rp, tp, up, n); +} +#define READY_WITH_mpn_sqr_basecase +#endif + + +#if ! defined (READY_WITH_mpn_sqr_basecase) && HAVE_NATIVE_mpn_addmul_2 + +/* mpn_sqr_basecase using plain mpn_addmul_2. + + This is tricky, since we have to let mpn_addmul_2 make some undesirable + multiplies, u[k]*u[k], that we would like to let mpn_sqr_diagonal handle. + This forces us to conditionally add or subtract the mpn_sqr_diagonal + results. Examples of the product we form: + + n = 4 n = 5 n = 6 + u1u0 * u3u2u1 u1u0 * u4u3u2u1 u1u0 * u5u4u3u2u1 + u2 * u3 u3u2 * u4u3 u3u2 * u5u4u3 + u4 * u5 + add: u0 u2 u3 add: u0 u2 u4 add: u0 u2 u4 u5 + sub: u1 sub: u1 u3 sub: u1 u3 +*/ + +void +mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_size_t i; + mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD]; + mp_ptr tp = tarr; + mp_limb_t cy; + + /* must fit 2*n limbs in tarr */ + ASSERT (n <= SQR_TOOM2_THRESHOLD); + + if ((n & 1) != 0) + { + mp_limb_t x0, x1; + + if (n == 1) + { + mp_limb_t ul, lpl; + ul = up[0]; + umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS); + rp[0] = lpl >> GMP_NAIL_BITS; + return; + } + + /* The code below doesn't like unnormalized operands. Since such + operands are unusual, handle them with a dumb recursion. 
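+     (The zero high limb is stripped: the two top result limbs are cleared
+     and the remaining n - 1 limbs are squared.)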
*/ + if (up[n - 1] == 0) + { + rp[2 * n - 2] = 0; + rp[2 * n - 1] = 0; + mpn_sqr_basecase (rp, up, n - 1); + return; + } + + MPN_ZERO (tp, n); + + for (i = 0; i <= n - 2; i += 2) + { + cy = mpn_addmul_2 (tp + 2 * i, up + i + 1, n - (i + 1), up + i); + tp[n + i] = cy; + } + + MPN_SQR_DIAGONAL (rp, up, n); + + for (i = 2;; i += 4) + { + x0 = rp[i + 0]; + rp[i + 0] = (-x0) & GMP_NUMB_MASK; + x1 = rp[i + 1]; + rp[i + 1] = (-x1 - (x0 != 0)) & GMP_NUMB_MASK; + __GMPN_SUB_1 (cy, rp + i + 2, rp + i + 2, 2, (x1 | x0) != 0); + if (i + 4 >= 2 * n) + break; + mpn_incr_u (rp + i + 4, cy); + } + } + else + { + mp_limb_t x0, x1; + + if (n == 2) + { +#if HAVE_NATIVE_mpn_mul_2 + rp[3] = mpn_mul_2 (rp, up, 2, up); +#else + rp[0] = 0; + rp[1] = 0; + rp[3] = mpn_addmul_2 (rp, up, 2, up); +#endif + return; + } + + /* The code below doesn't like unnormalized operands. Since such + operands are unusual, handle them with a dumb recursion. */ + if (up[n - 1] == 0) + { + rp[2 * n - 2] = 0; + rp[2 * n - 1] = 0; + mpn_sqr_basecase (rp, up, n - 1); + return; + } + + MPN_ZERO (tp, n); + + for (i = 0; i <= n - 4; i += 2) + { + cy = mpn_addmul_2 (tp + 2 * i, up + i + 1, n - (i + 1), up + i); + tp[n + i] = cy; + } + cy = mpn_addmul_1 (tp + 2 * n - 4, up + n - 1, 1, up[n - 2]); + tp[2 * n - 3] = cy; + + MPN_SQR_DIAGONAL (rp, up, n); + + for (i = 2;; i += 4) + { + x0 = rp[i + 0]; + rp[i + 0] = (-x0) & GMP_NUMB_MASK; + x1 = rp[i + 1]; + rp[i + 1] = (-x1 - (x0 != 0)) & GMP_NUMB_MASK; + if (i + 6 >= 2 * n) + break; + __GMPN_SUB_1 (cy, rp + i + 2, rp + i + 2, 2, (x1 | x0) != 0); + mpn_incr_u (rp + i + 4, cy); + } + mpn_decr_u (rp + i + 2, (x1 | x0) != 0); + } + +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2); +#else + cy = mpn_lshift (tp, tp, 2 * n - 2, 1); + cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2); +#endif + rp[2 * n - 1] += cy; +} +#define READY_WITH_mpn_sqr_basecase +#endif + + +#if ! defined (READY_WITH_mpn_sqr_basecase) && HAVE_NATIVE_mpn_sqr_diag_addlsh1 + +/* mpn_sqr_basecase using mpn_addmul_1 and mpn_sqr_diag_addlsh1, avoiding stack + allocation. */ +void +mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + if (n == 1) + { + mp_limb_t ul, lpl; + ul = up[0]; + umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS); + rp[0] = lpl >> GMP_NAIL_BITS; + } + else + { + mp_size_t i; + mp_ptr xp; + + rp += 1; + rp[n - 1] = mpn_mul_1 (rp, up + 1, n - 1, up[0]); + for (i = n - 2; i != 0; i--) + { + up += 1; + rp += 2; + rp[i] = mpn_addmul_1 (rp, up + 1, i, up[0]); + } + + xp = rp - 2 * n + 3; + mpn_sqr_diag_addlsh1 (xp, xp + 1, up - n + 2, n); + } +} +#define READY_WITH_mpn_sqr_basecase +#endif + + +#if ! defined (READY_WITH_mpn_sqr_basecase) + +/* Default mpn_sqr_basecase using mpn_addmul_1. */ +void +mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_size_t i; + + ASSERT (n >= 1); + ASSERT (! 
MPN_OVERLAP_P (rp, 2*n, up, n)); + + if (n == 1) + { + mp_limb_t ul, lpl; + ul = up[0]; + umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS); + rp[0] = lpl >> GMP_NAIL_BITS; + } + else + { + mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD]; + mp_ptr tp = tarr; + mp_limb_t cy; + + /* must fit 2*n limbs in tarr */ + ASSERT (n <= SQR_TOOM2_THRESHOLD); + + cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]); + tp[n - 1] = cy; + for (i = 2; i < n; i++) + { + mp_limb_t cy; + cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]); + tp[n + i - 2] = cy; + } + + MPN_SQR_DIAG_ADDLSH1 (rp, tp, up, n); + } +} +#define READY_WITH_mpn_sqr_basecase +#endif diff --git a/gmp-6.3.0/mpn/generic/sqrlo.c b/gmp-6.3.0/mpn/generic/sqrlo.c new file mode 100644 index 0000000..71530b6 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sqrlo.c @@ -0,0 +1,239 @@ +/* mpn_sqrlo -- squares an n-limb number and returns the low n limbs + of the result. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS + FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED + THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2004, 2005, 2009, 2010, 2012, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_range_basecase 1 +#define MAYBE_range_toom22 1 +#else +#define MAYBE_range_basecase \ + ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM2_THRESHOLD*36/(36-11)) +#define MAYBE_range_toom22 \ + ((SQRLO_DC_THRESHOLD == 0 ? SQRLO_BASECASE_THRESHOLD : SQRLO_DC_THRESHOLD) < SQR_TOOM3_THRESHOLD*36/(36-11) ) +#endif + +/* THINK: The DC strategy uses different constants in different Toom's + ranges. Something smoother? +*/ + +/* + Compute the least significant half of the product {xy,n}*{yp,n}, or + formally {rp,n} = {xy,n}*{yp,n} Mod (B^n). + + Above the given threshold, the Divide and Conquer strategy is used. + The operand is split in two, and a full square plus a mullo + is used to obtain the final result. The more natural strategy is to + split in two halves, but this is far from optimal when a + sub-quadratic multiplication is used. + + Mulders suggests an unbalanced split in favour of the full product, + split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2. 
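+  (For instance, with the Karatsuba-range ratio a = 11/36 chosen below,
+  n = 36 limbs splits into n1 = 11 and n2 = 25.)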
+ + To compute the value of a, we assume that the cost of mullo for a + given size ML(n) is a fraction of the cost of a full product with + same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2; + then we can write: + + ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e + + Given a value for e, want to minimise the value of k, i.e. the + function k=(1-a)^e/(1-2*a^e). + + With e=2, the exponent for schoolbook multiplication, the minimum is + given by the values a=1-a=1/2. + + With e=log(3)/log(2), the exponent for Karatsuba (aka toom22), + Mulders compute (1-a) = 0.694... and we approximate a with 11/36. + + Other possible approximations follow: + e=log(5)/log(3) [Toom-3] -> a ~= 9/40 + e=log(7)/log(4) [Toom-4] -> a ~= 7/39 + e=log(11)/log(6) [Toom-6] -> a ~= 1/8 + e=log(15)/log(8) [Toom-8] -> a ~= 1/10 + + The values above where obtained with the following trivial commands + in the gp-pari shell: + +fun(e,a)=(1-a)^e/(1-2*a^e) +mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)= 2); + ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); + ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n)); + + /* Divide-and-conquer */ + + /* We need fractional approximation of the value 0 < a <= 1/2 + giving the minimum in the function k=(1-a)^e/(1-2*a^e). + */ + if (MAYBE_range_basecase && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD*36/(36-11))) + n1 = n >> 1; + else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD*36/(36-11))) + n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */ + else if (BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD*40/(40-9))) + n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */ + else if (BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD*10/9)) + n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */ + /* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */ + else + n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */ + + n2 = n - n1; + + /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0 */ + + /* x0 ^ 2 */ + mpn_sqr (tp, xp, n2); + MPN_COPY (rp, tp, n2); + + /* x1 * x0 * 2^(n2 GMP_NUMB_BITS) */ + if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD)) + mpn_mul_basecase (tp + n, xp + n2, n1, xp, n1); + else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD)) + mpn_mullo_basecase (tp + n, xp + n2, xp, n1); + else + mpn_mullo_n (tp + n, xp + n2, xp, n1); + /* mpn_dc_mullo_n (tp + n, xp + n2, xp, n1, tp + n); */ +#if HAVE_NATIVE_mpn_addlsh1_n + mpn_addlsh1_n (rp + n2, tp + n2, tp + n, n1); +#else + mpn_lshift (rp + n2, tp + n, n1, 1); + mpn_add_n (rp + n2, rp + n2, tp + n2, n1); +#endif +} + +/* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */ +#define SQR_BASECASE_ALLOC \ + (SQRLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*SQRLO_BASECASE_THRESHOLD_LIMIT) + +/* FIXME: This function should accept a temporary area; dc_sqrlo + accepts a pointer tp, and handle the case tp == rp, do the same here. +*/ + +void +mpn_sqrlo (mp_ptr rp, mp_srcptr xp, mp_size_t n) +{ + ASSERT (n >= 1); + ASSERT (! MPN_OVERLAP_P (rp, n, xp, n)); + + if (BELOW_THRESHOLD (n, SQRLO_BASECASE_THRESHOLD)) + { + /* FIXME: smarter criteria? */ +#if HAVE_NATIVE_mpn_mullo_basecase || ! HAVE_NATIVE_mpn_sqr_basecase + /* mullo computes as many products as sqr, but directly writes + on the result area. */ + mpn_mullo_basecase (rp, xp, xp, n); +#else + /* Allocate workspace of fixed size on stack: fast! 
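+     (SQR_BASECASE_ALLOC is a compile-time constant:
+     2*SQRLO_BASECASE_THRESHOLD_LIMIT limbs, or a single limb when that
+     limit is zero.)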
*/ + mp_limb_t tp[SQR_BASECASE_ALLOC]; + mpn_sqr_basecase (tp, xp, n); + MPN_COPY (rp, tp, n); +#endif + } + else if (BELOW_THRESHOLD (n, SQRLO_DC_THRESHOLD)) + { + mpn_sqrlo_basecase (rp, xp, n); + } + else + { + mp_ptr tp; + TMP_DECL; + TMP_MARK; + tp = TMP_ALLOC_LIMBS (mpn_sqrlo_itch (n)); + if (BELOW_THRESHOLD (n, SQRLO_SQR_THRESHOLD)) + { + mpn_dc_sqrlo (rp, xp, n, tp); + } + else + { + /* For really large operands, use plain mpn_mul_n but throw away upper n + limbs of result. */ +#if !TUNE_PROGRAM_BUILD && (SQRLO_SQR_THRESHOLD > SQR_FFT_THRESHOLD) + mpn_fft_mul (tp, xp, n, xp, n); +#else + mpn_sqr (tp, xp, n); +#endif + MPN_COPY (rp, tp, n); + } + TMP_FREE; + } +} diff --git a/gmp-6.3.0/mpn/generic/sqrlo_basecase.c b/gmp-6.3.0/mpn/generic/sqrlo_basecase.c new file mode 100644 index 0000000..3148609 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sqrlo_basecase.c @@ -0,0 +1,194 @@ +/* mpn_sqrlo_basecase -- Internal routine to square a natural number + of length n. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. + + +Copyright 1991-1994, 1996, 1997, 2000-2005, 2008, 2010, 2011, 2015, +2016 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" +#include "longlong.h" + +#ifndef SQRLO_SHORTCUT_MULTIPLICATIONS +#if HAVE_NATIVE_mpn_addmul_1 +#define SQRLO_SHORTCUT_MULTIPLICATIONS 0 +#else +#define SQRLO_SHORTCUT_MULTIPLICATIONS 1 +#endif +#endif + +#if HAVE_NATIVE_mpn_sqr_diagonal +#define MPN_SQR_DIAGONAL(rp, up, n) \ + mpn_sqr_diagonal (rp, up, n) +#else +#define MPN_SQR_DIAGONAL(rp, up, n) \ + do { \ + mp_size_t _i; \ + for (_i = 0; _i < (n); _i++) \ + { \ + mp_limb_t ul, lpl; \ + ul = (up)[_i]; \ + umul_ppmm ((rp)[2 * _i + 1], lpl, ul, ul << GMP_NAIL_BITS); \ + (rp)[2 * _i] = lpl >> GMP_NAIL_BITS; \ + } \ + } while (0) +#endif + +#define MPN_SQRLO_DIAGONAL(rp, up, n) \ + do { \ + mp_size_t nhalf; \ + nhalf = (n) >> 1; \ + MPN_SQR_DIAGONAL ((rp), (up), nhalf); \ + if (((n) & 1) != 0) \ + { \ + mp_limb_t op; \ + op = (up)[nhalf]; \ + (rp)[(n) - 1] = (op * op) & GMP_NUMB_MASK; \ + } \ + } while (0) + +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 +#define MPN_SQRLO_DIAG_ADDLSH1(rp, tp, up, n) \ + do { \ + MPN_SQRLO_DIAGONAL((rp), (up), (n)); \ + mpn_addlsh1_n_ip1 ((rp) + 1, (tp), (n) - 1); \ + } while (0) +#else +#define MPN_SQRLO_DIAG_ADDLSH1(rp, tp, up, n) \ + do { \ + MPN_SQRLO_DIAGONAL((rp), (up), (n)); \ + mpn_lshift ((tp), (tp), (n) - 1, 1); \ + mpn_add_n ((rp) + 1, (rp) + 1, (tp), (n) - 1); \ + } while (0) +#endif + +/* Avoid zero allocations when SQRLO_LO_THRESHOLD is 0 (this code not used). */ +#define SQRLO_BASECASE_ALLOC \ + (SQRLO_DC_THRESHOLD_LIMIT < 2 ? 1 : SQRLO_DC_THRESHOLD_LIMIT - 1) + +/* Default mpn_sqrlo_basecase using mpn_addmul_1. */ +#ifndef SQRLO_SPECIAL_CASES +#define SQRLO_SPECIAL_CASES 2 +#endif + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_special_cases 1 +#else +#define MAYBE_special_cases \ + ((SQRLO_BASECASE_THRESHOLD <= SQRLO_SPECIAL_CASES) && (SQRLO_DC_THRESHOLD != 0)) +#endif + +void +mpn_sqrlo_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n) +{ + mp_limb_t ul; + + ASSERT (n >= 1); + ASSERT (! 
MPN_OVERLAP_P (rp, n, up, n)); + + ul = up[0]; + + if (MAYBE_special_cases && n <= SQRLO_SPECIAL_CASES) + { +#if SQRLO_SPECIAL_CASES == 1 + rp[0] = (ul * ul) & GMP_NUMB_MASK; +#else + if (n == 1) + rp[0] = (ul * ul) & GMP_NUMB_MASK; + else + { + mp_limb_t hi, lo, ul1; + umul_ppmm (hi, lo, ul, ul << GMP_NAIL_BITS); + rp[0] = lo >> GMP_NAIL_BITS; + ul1 = up[1]; +#if SQRLO_SPECIAL_CASES == 2 + rp[1] = (hi + ul * ul1 * 2) & GMP_NUMB_MASK; +#else + if (n == 2) + rp[1] = (hi + ul * ul1 * 2) & GMP_NUMB_MASK; + else + { + mp_limb_t hi1; +#if GMP_NAIL_BITS != 0 + ul <<= 1; +#endif + umul_ppmm (hi1, lo, ul1 << GMP_NAIL_BITS, ul); + hi1 += ul * up[2]; +#if GMP_NAIL_BITS == 0 + hi1 = (hi1 << 1) | (lo >> (GMP_LIMB_BITS - 1)); + add_ssaaaa(rp[2], rp[1], hi1, lo << 1, ul1 * ul1, hi); +#else + hi += lo >> GMP_NAIL_BITS; + rp[1] = hi & GMP_NUMB_MASK; + rp[2] = (hi1 + ul1 * ul1 + (hi >> GMP_NUMB_BITS)) & GMP_NUMB_MASK; +#endif + } +#endif + } +#endif + } + else + { + mp_limb_t tp[SQRLO_BASECASE_ALLOC]; + mp_size_t i; + + /* must fit n-1 limbs in tp */ + ASSERT (n <= SQRLO_DC_THRESHOLD_LIMIT); + + --n; +#if SQRLO_SHORTCUT_MULTIPLICATIONS + { + mp_limb_t cy; + + cy = ul * up[n] + mpn_mul_1 (tp, up + 1, n - 1, ul); + for (i = 1; 2 * i + 1 < n; ++i) + { + ul = up[i]; + cy += ul * up[n - i] + mpn_addmul_1 (tp + 2 * i, up + i + 1, n - 2 * i - 1, ul); + } + tp [n-1] = (cy + ((n & 1)?up[i] * up[i + 1]:0)) & GMP_NUMB_MASK; + } +#else + mpn_mul_1 (tp, up + 1, n, ul); + for (i = 1; 2 * i < n; ++i) + mpn_addmul_1 (tp + 2 * i, up + i + 1, n - 2 * i, up[i]); +#endif + + MPN_SQRLO_DIAG_ADDLSH1 (rp, tp, up, n + 1); + } +} +#undef SQRLO_SPECIAL_CASES +#undef MAYBE_special_cases +#undef SQRLO_BASECASE_ALLOC +#undef SQRLO_SHORTCUT_MULTIPLICATIONS +#undef MPN_SQR_DIAGONAL +#undef MPN_SQRLO_DIAGONAL +#undef MPN_SQRLO_DIAG_ADDLSH1 diff --git a/gmp-6.3.0/mpn/generic/sqrmod_bnm1.c b/gmp-6.3.0/mpn/generic/sqrmod_bnm1.c new file mode 100644 index 0000000..0acbe12 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sqrmod_bnm1.c @@ -0,0 +1,328 @@ +/* sqrmod_bnm1.c -- squaring mod B^n-1. + + Contributed to the GNU project by Niels Möller, Torbjorn Granlund and + Marco Bodrato. + + THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY + SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010, 2012, 2020, 2022 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" +#include "longlong.h" + +/* Input is {ap,rn}; output is {rp,rn}, computation is + mod B^rn - 1, and values are semi-normalised; zero is represented + as either 0 or B^n - 1. Needs a scratch of 2rn limbs at tp. + tp==rp is allowed. */ +static void +mpn_bc_sqrmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp) +{ + mp_limb_t cy; + + ASSERT (0 < rn); + + mpn_sqr (tp, ap, rn); + cy = mpn_add_n (rp, tp, tp + rn, rn); + /* If cy == 1, then the value of rp is at most B^rn - 2, so there can + * be no overflow when adding in the carry. */ + MPN_INCR_U (rp, rn, cy); +} + + +/* Input is {ap,rn+1}; output is {rp,rn+1}, in + normalised representation, computation is mod B^rn + 1. Needs + a scratch area of 2rn limbs at tp; tp == rp is allowed. + Output is normalised. */ +static void +mpn_bc_sqrmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_size_t rn, mp_ptr tp) +{ + mp_limb_t cy; + unsigned k; + + ASSERT (0 < rn); + + if (UNLIKELY (ap[rn])) + { + *rp = 1; + MPN_FILL (rp + 1, rn, 0); + return; + } + else if (MPN_SQRMOD_BKNP1_USABLE (rn, k, MUL_FFT_MODF_THRESHOLD)) + { + mp_size_t n_k = rn / k; + TMP_DECL; + + TMP_MARK; + mpn_sqrmod_bknp1 (rp, ap, n_k, k, + TMP_ALLOC_LIMBS (mpn_sqrmod_bknp1_itch (rn))); + TMP_FREE; + return; + } + mpn_sqr (tp, ap, rn); + cy = mpn_sub_n (rp, tp, tp + rn, rn); + rp[rn] = 0; + MPN_INCR_U (rp, rn + 1, cy); +} + + +/* Computes {rp,MIN(rn,2an)} <- {ap,an}^2 Mod(B^rn-1) + * + * The result is expected to be ZERO if and only if the operand + * already is. Otherwise the class [0] Mod(B^rn-1) is represented by + * B^rn-1. + * It should not be a problem if sqrmod_bnm1 is used to + * compute the full square with an <= 2*rn, because this condition + * implies (B^an-1)^2 < (B^rn-1) . + * + * Requires rn/4 < an <= rn + * Scratch need: rn/2 + (need for recursive call OR rn + 3). 
This gives + * + * S(n) <= rn/2 + MAX (rn + 4, S(n/2)) <= 3/2 rn + 4 + */ +void +mpn_sqrmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_ptr tp) +{ + ASSERT (0 < an); + ASSERT (an <= rn); + + if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, SQRMOD_BNM1_THRESHOLD)) + { + if (UNLIKELY (an < rn)) + { + if (UNLIKELY (2*an <= rn)) + { + mpn_sqr (rp, ap, an); + } + else + { + mp_limb_t cy; + mpn_sqr (tp, ap, an); + cy = mpn_add (rp, tp, rn, tp + rn, 2*an - rn); + MPN_INCR_U (rp, rn, cy); + } + } + else + mpn_bc_sqrmod_bnm1 (rp, ap, rn, tp); + } + else + { + mp_size_t n; + mp_limb_t cy; + mp_limb_t hi; + + n = rn >> 1; + + ASSERT (2*an > n); + + /* Compute xm = a^2 mod (B^n - 1), xp = a^2 mod (B^n + 1) + and crt together as + + x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)] + */ + +#define a0 ap +#define a1 (ap + n) + +#define xp tp /* 2n + 2 */ + /* am1 maybe in {xp, n} */ +#define sp1 (tp + 2*n + 2) + /* ap1 maybe in {sp1, n + 1} */ + + { + mp_srcptr am1; + mp_size_t anm; + mp_ptr so; + + if (LIKELY (an > n)) + { + so = xp + n; + am1 = xp; + cy = mpn_add (xp, a0, n, a1, an - n); + MPN_INCR_U (xp, n, cy); + anm = n; + } + else + { + so = xp; + am1 = a0; + anm = an; + } + + mpn_sqrmod_bnm1 (rp, n, am1, anm, so); + } + + { + int k; + mp_srcptr ap1; + mp_size_t anp; + + if (LIKELY (an > n)) { + ap1 = sp1; + cy = mpn_sub (sp1, a0, n, a1, an - n); + sp1[n] = 0; + MPN_INCR_U (sp1, n + 1, cy); + anp = n + ap1[n]; + } else { + ap1 = a0; + anp = an; + } + + if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD)) + k=0; + else + { + int mask; + k = mpn_fft_best_k (n, 1); + mask = (1<>=1;}; + } + if (k >= FFT_FIRST_K) + xp[n] = mpn_mul_fft (xp, n, ap1, anp, ap1, anp, k); + else if (UNLIKELY (ap1 == a0)) + { + ASSERT (anp <= n); + ASSERT (2*anp > n); + mpn_sqr (xp, a0, an); + anp = 2*an - n; + cy = mpn_sub (xp, xp, n, xp + n, anp); + xp[n] = 0; + MPN_INCR_U (xp, n+1, cy); + } + else + mpn_bc_sqrmod_bnp1 (xp, ap1, n, xp); + } + + /* Here the CRT recomposition begins. + + xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1) + Division by 2 is a bitwise rotation. + + Assumes xp normalised mod (B^n+1). + + The residue class [0] is represented by [B^n-1]; except when + both input are ZERO. + */ + +#if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc +#if HAVE_NATIVE_mpn_rsh1add_nc + cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */ + hi = cy << (GMP_NUMB_BITS - 1); + cy = 0; + /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi + overflows, i.e. a further increment will not overflow again. */ +#else /* ! _nc */ + cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */ + hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */ + cy >>= 1; + /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that + the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */ +#endif +#if GMP_NAIL_BITS == 0 + add_ssaaaa(cy, rp[n-1], cy, rp[n-1], CNST_LIMB(0), hi); +#else + cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1); + rp[n-1] ^= hi; +#endif +#else /* ! HAVE_NATIVE_mpn_rsh1add_n */ +#if HAVE_NATIVE_mpn_add_nc + cy = mpn_add_nc(rp, rp, xp, n, xp[n]); +#else /* ! _nc */ + cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */ +#endif + cy += (rp[0]&1); + mpn_rshift(rp, rp, n, 1); + ASSERT (cy <= 2); + hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */ + cy >>= 1; + /* We can have cy != 0 only if hi = 0... */ + ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0); + rp[n-1] |= hi; + /* ... rp[n-1] + cy can not overflow, the following INCR is correct. 
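+     As a toy sanity check of the recomposition formula used above,
+     x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1) ], with
+     made-up single-digit sizes: take B^n = 10 and a = 7, so a^2 = 49,
+     xm = 49 mod 9 = 4, xp = 49 mod 11 = 5; then (xp + xm)/2 mod 9 = 0
+     and x = -5*10 + 11*0 = -50 = 49 (mod 99), as expected.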
*/ +#endif + ASSERT (cy <= 1); + /* Next increment can not overflow, read the previous comments about cy. */ + ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0)); + MPN_INCR_U(rp, n, cy); + + /* Compute the highest half: + ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n + */ + if (UNLIKELY (2*an < rn)) + { + /* Note that in this case, the only way the result can equal + zero mod B^{rn} - 1 is if the input is zero, and + then the output of both the recursive calls and this CRT + reconstruction is zero, not B^{rn} - 1. */ + cy = mpn_sub_n (rp + n, rp, xp, 2*an - n); + + /* FIXME: This subtraction of the high parts is not really + necessary, we do it to get the carry out, and for sanity + checking. */ + cy = xp[n] + mpn_sub_nc (xp + 2*an - n, rp + 2*an - n, + xp + 2*an - n, rn - 2*an, cy); + ASSERT (mpn_zero_p (xp + 2*an - n+1, rn - 1 - 2*an)); + cy = mpn_sub_1 (rp, rp, 2*an, cy); + ASSERT (cy == (xp + 2*an - n)[0]); + } + else + { + cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n); + /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO. + DECR will affect _at most_ the lowest n limbs. */ + MPN_DECR_U (rp, 2*n, cy); + } +#undef a0 +#undef a1 +#undef xp +#undef sp1 + } +} + +mp_size_t +mpn_sqrmod_bnm1_next_size (mp_size_t n) +{ + mp_size_t nh; + + if (BELOW_THRESHOLD (n, SQRMOD_BNM1_THRESHOLD)) + return n; + if (BELOW_THRESHOLD (n, 4 * (SQRMOD_BNM1_THRESHOLD - 1) + 1)) + return (n + (2-1)) & (-2); + if (BELOW_THRESHOLD (n, 8 * (SQRMOD_BNM1_THRESHOLD - 1) + 1)) + return (n + (4-1)) & (-4); + + nh = (n + 1) >> 1; + + if (BELOW_THRESHOLD (nh, SQR_FFT_MODF_THRESHOLD)) + return (n + (8-1)) & (-8); + + return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 1)); +} diff --git a/gmp-6.3.0/mpn/generic/sqrtrem.c b/gmp-6.3.0/mpn/generic/sqrtrem.c new file mode 100644 index 0000000..cc6dd9c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sqrtrem.c @@ -0,0 +1,555 @@ +/* mpn_sqrtrem -- square root and remainder + + Contributed to the GNU project by Paul Zimmermann (most code), + Torbjorn Granlund (mpn_sqrtrem1) and Marco Bodrato (mpn_dc_sqrt). + + THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH MUTABLE + INTERFACES. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. + IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A + FUTURE GMP RELEASE. + +Copyright 1999-2002, 2004, 2005, 2008, 2010, 2012, 2015, 2017 Free Software +Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +/* See "Karatsuba Square Root", reference in gmp.texi. 
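+   Informally, the scheme from that paper (sketched here only for
+   orientation) is: given a normalized operand split as N = N1*B^(2l) + N0,
+   recurse on the high part, (s1,r1) = sqrtrem(N1); divide,
+   (q,u) = divrem(r1*B^l + high half of N0, 2*s1); take s = s1*B^l + q and
+   r = u*B^l + low half of N0 - q^2; if r < 0, correct with r += 2*s - 1,
+   s -= 1.  mpn_dc_sqrtrem below follows this shape, folding the division
+   by 2*s1 into a division by s1 followed by a one-bit right shift.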
*/ + + +#include +#include + +#include "gmp-impl.h" +#include "longlong.h" +#define USE_DIVAPPR_Q 1 +#define TRACE(x) + +static const unsigned char invsqrttab[384] = /* The common 0x100 was removed */ +{ + 0xff,0xfd,0xfb,0xf9,0xf7,0xf5,0xf3,0xf2, /* sqrt(1/80)..sqrt(1/87) */ + 0xf0,0xee,0xec,0xea,0xe9,0xe7,0xe5,0xe4, /* sqrt(1/88)..sqrt(1/8f) */ + 0xe2,0xe0,0xdf,0xdd,0xdb,0xda,0xd8,0xd7, /* sqrt(1/90)..sqrt(1/97) */ + 0xd5,0xd4,0xd2,0xd1,0xcf,0xce,0xcc,0xcb, /* sqrt(1/98)..sqrt(1/9f) */ + 0xc9,0xc8,0xc6,0xc5,0xc4,0xc2,0xc1,0xc0, /* sqrt(1/a0)..sqrt(1/a7) */ + 0xbe,0xbd,0xbc,0xba,0xb9,0xb8,0xb7,0xb5, /* sqrt(1/a8)..sqrt(1/af) */ + 0xb4,0xb3,0xb2,0xb0,0xaf,0xae,0xad,0xac, /* sqrt(1/b0)..sqrt(1/b7) */ + 0xaa,0xa9,0xa8,0xa7,0xa6,0xa5,0xa4,0xa3, /* sqrt(1/b8)..sqrt(1/bf) */ + 0xa2,0xa0,0x9f,0x9e,0x9d,0x9c,0x9b,0x9a, /* sqrt(1/c0)..sqrt(1/c7) */ + 0x99,0x98,0x97,0x96,0x95,0x94,0x93,0x92, /* sqrt(1/c8)..sqrt(1/cf) */ + 0x91,0x90,0x8f,0x8e,0x8d,0x8c,0x8c,0x8b, /* sqrt(1/d0)..sqrt(1/d7) */ + 0x8a,0x89,0x88,0x87,0x86,0x85,0x84,0x83, /* sqrt(1/d8)..sqrt(1/df) */ + 0x83,0x82,0x81,0x80,0x7f,0x7e,0x7e,0x7d, /* sqrt(1/e0)..sqrt(1/e7) */ + 0x7c,0x7b,0x7a,0x79,0x79,0x78,0x77,0x76, /* sqrt(1/e8)..sqrt(1/ef) */ + 0x76,0x75,0x74,0x73,0x72,0x72,0x71,0x70, /* sqrt(1/f0)..sqrt(1/f7) */ + 0x6f,0x6f,0x6e,0x6d,0x6d,0x6c,0x6b,0x6a, /* sqrt(1/f8)..sqrt(1/ff) */ + 0x6a,0x69,0x68,0x68,0x67,0x66,0x66,0x65, /* sqrt(1/100)..sqrt(1/107) */ + 0x64,0x64,0x63,0x62,0x62,0x61,0x60,0x60, /* sqrt(1/108)..sqrt(1/10f) */ + 0x5f,0x5e,0x5e,0x5d,0x5c,0x5c,0x5b,0x5a, /* sqrt(1/110)..sqrt(1/117) */ + 0x5a,0x59,0x59,0x58,0x57,0x57,0x56,0x56, /* sqrt(1/118)..sqrt(1/11f) */ + 0x55,0x54,0x54,0x53,0x53,0x52,0x52,0x51, /* sqrt(1/120)..sqrt(1/127) */ + 0x50,0x50,0x4f,0x4f,0x4e,0x4e,0x4d,0x4d, /* sqrt(1/128)..sqrt(1/12f) */ + 0x4c,0x4b,0x4b,0x4a,0x4a,0x49,0x49,0x48, /* sqrt(1/130)..sqrt(1/137) */ + 0x48,0x47,0x47,0x46,0x46,0x45,0x45,0x44, /* sqrt(1/138)..sqrt(1/13f) */ + 0x44,0x43,0x43,0x42,0x42,0x41,0x41,0x40, /* sqrt(1/140)..sqrt(1/147) */ + 0x40,0x3f,0x3f,0x3e,0x3e,0x3d,0x3d,0x3c, /* sqrt(1/148)..sqrt(1/14f) */ + 0x3c,0x3b,0x3b,0x3a,0x3a,0x39,0x39,0x39, /* sqrt(1/150)..sqrt(1/157) */ + 0x38,0x38,0x37,0x37,0x36,0x36,0x35,0x35, /* sqrt(1/158)..sqrt(1/15f) */ + 0x35,0x34,0x34,0x33,0x33,0x32,0x32,0x32, /* sqrt(1/160)..sqrt(1/167) */ + 0x31,0x31,0x30,0x30,0x2f,0x2f,0x2f,0x2e, /* sqrt(1/168)..sqrt(1/16f) */ + 0x2e,0x2d,0x2d,0x2d,0x2c,0x2c,0x2b,0x2b, /* sqrt(1/170)..sqrt(1/177) */ + 0x2b,0x2a,0x2a,0x29,0x29,0x29,0x28,0x28, /* sqrt(1/178)..sqrt(1/17f) */ + 0x27,0x27,0x27,0x26,0x26,0x26,0x25,0x25, /* sqrt(1/180)..sqrt(1/187) */ + 0x24,0x24,0x24,0x23,0x23,0x23,0x22,0x22, /* sqrt(1/188)..sqrt(1/18f) */ + 0x21,0x21,0x21,0x20,0x20,0x20,0x1f,0x1f, /* sqrt(1/190)..sqrt(1/197) */ + 0x1f,0x1e,0x1e,0x1e,0x1d,0x1d,0x1d,0x1c, /* sqrt(1/198)..sqrt(1/19f) */ + 0x1c,0x1b,0x1b,0x1b,0x1a,0x1a,0x1a,0x19, /* sqrt(1/1a0)..sqrt(1/1a7) */ + 0x19,0x19,0x18,0x18,0x18,0x18,0x17,0x17, /* sqrt(1/1a8)..sqrt(1/1af) */ + 0x17,0x16,0x16,0x16,0x15,0x15,0x15,0x14, /* sqrt(1/1b0)..sqrt(1/1b7) */ + 0x14,0x14,0x13,0x13,0x13,0x12,0x12,0x12, /* sqrt(1/1b8)..sqrt(1/1bf) */ + 0x12,0x11,0x11,0x11,0x10,0x10,0x10,0x0f, /* sqrt(1/1c0)..sqrt(1/1c7) */ + 0x0f,0x0f,0x0f,0x0e,0x0e,0x0e,0x0d,0x0d, /* sqrt(1/1c8)..sqrt(1/1cf) */ + 0x0d,0x0c,0x0c,0x0c,0x0c,0x0b,0x0b,0x0b, /* sqrt(1/1d0)..sqrt(1/1d7) */ + 0x0a,0x0a,0x0a,0x0a,0x09,0x09,0x09,0x09, /* sqrt(1/1d8)..sqrt(1/1df) */ + 0x08,0x08,0x08,0x07,0x07,0x07,0x07,0x06, /* sqrt(1/1e0)..sqrt(1/1e7) */ + 0x06,0x06,0x06,0x05,0x05,0x05,0x04,0x04, /* 
sqrt(1/1e8)..sqrt(1/1ef) */ + 0x04,0x04,0x03,0x03,0x03,0x03,0x02,0x02, /* sqrt(1/1f0)..sqrt(1/1f7) */ + 0x02,0x02,0x01,0x01,0x01,0x01,0x00,0x00 /* sqrt(1/1f8)..sqrt(1/1ff) */ +}; + +/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2. */ + +#if GMP_NUMB_BITS > 32 +#define MAGIC CNST_LIMB(0x10000000000) /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */ +#else +#define MAGIC CNST_LIMB(0x100000) /* 0xfee6f < MAGIC < 0x29cbc8 */ +#endif + +static mp_limb_t +mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0) +{ +#if GMP_NUMB_BITS > 32 + mp_limb_t a1; +#endif + mp_limb_t x0, t2, t, x2; + unsigned abits; + + ASSERT_ALWAYS (GMP_NAIL_BITS == 0); + ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64); + ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2); + + /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a), + since we can do the former without division. As part of the last + iteration convert from 1/sqrt(a) to sqrt(a). */ + + abits = a0 >> (GMP_LIMB_BITS - 1 - 8); /* extract bits for table lookup */ + x0 = 0x100 | invsqrttab[abits - 0x80]; /* initial 1/sqrt(a) */ + + /* x0 is now an 8 bits approximation of 1/sqrt(a0) */ + +#if GMP_NUMB_BITS > 32 + a1 = a0 >> (GMP_LIMB_BITS - 1 - 32); + t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16; + x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2)); + + /* x0 is now a 16 bits approximation of 1/sqrt(a0) */ + + t2 = x0 * (a0 >> (32-8)); + t = t2 >> 25; + t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8)); + x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15); + x0 >>= 32; +#else + t2 = x0 * (a0 >> (16-8)); + t = t2 >> 13; + t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8)); + x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7); + x0 >>= 16; +#endif + + /* x0 is now a full limb approximation of sqrt(a0) */ + + x2 = x0 * x0; + if (x2 + 2*x0 <= a0 - 1) + { + x2 += 2*x0 + 1; + x0++; + } + + *rp = a0 - x2; + return x0; +} + + +#define Prec (GMP_NUMB_BITS >> 1) +#if ! defined(SQRTREM2_INPLACE) +#define SQRTREM2_INPLACE 0 +#endif + +/* same as mpn_sqrtrem, but for size=2 and {np, 2} normalized + return cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */ +#if SQRTREM2_INPLACE +#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp) +static mp_limb_t +mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp) +{ + mp_srcptr np = rp; +#else +#define CALL_SQRTREM2_INPLACE(sp,rp) mpn_sqrtrem2 (sp, rp, rp) +static mp_limb_t +mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np) +{ +#endif + mp_limb_t q, u, np0, sp0, rp0, q2; + int cc; + + ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2); + + np0 = np[0]; + sp0 = mpn_sqrtrem1 (rp, np[1]); + rp0 = rp[0]; + /* rp0 <= 2*sp0 < 2^(Prec + 1) */ + rp0 = (rp0 << (Prec - 1)) + (np0 >> (Prec + 1)); + q = rp0 / sp0; + /* q <= 2^Prec, if q = 2^Prec, reduce the overestimate. */ + q -= q >> Prec; + /* now we have q < 2^Prec */ + u = rp0 - q * sp0; + /* now we have (rp[0]<>Prec)/2 = q * sp0 + u */ + sp0 = (sp0 << Prec) | q; + cc = u >> (Prec - 1); + rp0 = ((u << (Prec + 1)) & GMP_NUMB_MASK) + (np0 & ((CNST_LIMB (1) << (Prec + 1)) - 1)); + /* subtract q * q from rp */ + q2 = q * q; + cc -= rp0 < q2; + rp0 -= q2; + if (cc < 0) + { + rp0 += sp0; + cc += rp0 < sp0; + --sp0; + rp0 += sp0; + cc += rp0 < sp0; + } + + rp[0] = rp0; + sp[0] = sp0; + return cc; +} + +/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n}, + and in {np, n} the low n limbs of the remainder, returns the high + limb of the remainder (which is 0 or 1). + Assumes {np, 2n} is normalized, i.e. 
np[2n-1] >= B/4 + where B=2^GMP_NUMB_BITS. + Needs a scratch of n/2+1 limbs. */ +static mp_limb_t +mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n, mp_limb_t approx, mp_ptr scratch) +{ + mp_limb_t q; /* carry out of {sp, n} */ + int c, b; /* carry out of remainder */ + mp_size_t l, h; + + ASSERT (n > 1); + ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2); + + l = n / 2; + h = n - l; + if (h == 1) + q = CALL_SQRTREM2_INPLACE (sp + l, np + 2 * l); + else + q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h, 0, scratch); + if (q != 0) + ASSERT_CARRY (mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h)); + TRACE(printf("tdiv_qr(,,,,%u,,%u) -> %u\n", (unsigned) n, (unsigned) h, (unsigned) (n - h + 1))); + mpn_tdiv_qr (scratch, np + l, 0, np + l, n, sp + l, h); + q += scratch[l]; + c = scratch[0] & 1; + mpn_rshift (sp, scratch, l, 1); + sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK; + if (UNLIKELY ((sp[0] & approx) != 0)) /* (sp[0] & mask) > 1 */ + return 1; /* Remainder is non-zero */ + q >>= 1; + if (c != 0) + c = mpn_add_n (np + l, np + l, sp + l, h); + TRACE(printf("sqr(,,%u)\n", (unsigned) l)); + mpn_sqr (np + n, sp, l); + b = q + mpn_sub_n (np, np, np + n, 2 * l); + c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b); + + if (c < 0) + { + q = mpn_add_1 (sp + l, sp + l, h, q); +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n + c += mpn_addlsh1_n_ip1 (np, sp, n) + 2 * q; +#else + c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q; +#endif + c -= mpn_sub_1 (np, np, n, CNST_LIMB(1)); + q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1)); + } + + return c; +} + +#if USE_DIVAPPR_Q +static void +mpn_divappr_q (mp_ptr qp, mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn, mp_ptr scratch) +{ + gmp_pi1_t inv; + mp_limb_t qh; + ASSERT (dn > 2); + ASSERT (nn >= dn); + ASSERT ((dp[dn-1] & GMP_NUMB_HIGHBIT) != 0); + + MPN_COPY (scratch, np, nn); + invert_pi1 (inv, dp[dn-1], dp[dn-2]); + if (BELOW_THRESHOLD (dn, DC_DIVAPPR_Q_THRESHOLD)) + qh = mpn_sbpi1_divappr_q (qp, scratch, nn, dp, dn, inv.inv32); + else if (BELOW_THRESHOLD (dn, MU_DIVAPPR_Q_THRESHOLD)) + qh = mpn_dcpi1_divappr_q (qp, scratch, nn, dp, dn, &inv); + else + { + mp_size_t itch = mpn_mu_divappr_q_itch (nn, dn, 0); + TMP_DECL; + TMP_MARK; + /* Sadly, scratch is too small. */ + qh = mpn_mu_divappr_q (qp, np, nn, dp, dn, TMP_ALLOC_LIMBS (itch)); + TMP_FREE; + } + qp [nn - dn] = qh; +} +#endif + +/* writes in {sp, n} the square root (rounded towards zero) of {np, 2n-odd}, + returns zero if the operand was a perfect square, one otherwise. + Assumes {np, 2n-odd}*4^nsh is normalized, i.e. B > np[2n-1-odd]*4^nsh >= B/4 + where B=2^GMP_NUMB_BITS. + THINK: In the odd case, three more (dummy) limbs are taken into account, + when nsh is maximal, two limbs are discarded from the result of the + division. Too much? Is a single dummy limb enough? 
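+   (Illustration of the normalization requirement, assuming 64-bit limbs
+   with no nails: if the most significant limb of the operand is 3, then
+   nsh = 31 gives 3 * 4^31 = 3 * 2^62, which lies in [B/4, B) as required,
+   and indeed nsh < GMP_NUMB_BITS / 2.)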
*/ +static int +mpn_dc_sqrt (mp_ptr sp, mp_srcptr np, mp_size_t n, unsigned nsh, unsigned odd) +{ + mp_limb_t q; /* carry out of {sp, n} */ + int c; /* carry out of remainder */ + mp_size_t l, h; + mp_ptr qp, tp, scratch; + TMP_DECL; + TMP_MARK; + + ASSERT (np[2 * n - 1 - odd] != 0); + ASSERT (n > 4); + ASSERT (nsh < GMP_NUMB_BITS / 2); + + l = (n - 1) / 2; + h = n - l; + ASSERT (n >= l + 2 && l + 2 >= h && h > l && l >= 1 + odd); + scratch = TMP_ALLOC_LIMBS (l + 2 * n + 5 - USE_DIVAPPR_Q); /* n + 2-USE_DIVAPPR_Q */ + tp = scratch + n + 2 - USE_DIVAPPR_Q; /* n + h + 1, but tp [-1] is writable */ + if (nsh != 0) + { + /* o is used to exactly set the lowest bits of the dividend, is it needed? */ + int o = l > (1 + odd); + ASSERT_NOCARRY (mpn_lshift (tp - o, np + l - 1 - o - odd, n + h + 1 + o, 2 * nsh)); + } + else + MPN_COPY (tp, np + l - 1 - odd, n + h + 1); + q = mpn_dc_sqrtrem (sp + l, tp + l + 1, h, 0, scratch); + if (q != 0) + ASSERT_CARRY (mpn_sub_n (tp + l + 1, tp + l + 1, sp + l, h)); + qp = tp + n + 1; /* l + 2 */ + TRACE(printf("div(appr)_q(,,%u,,%u) -> %u \n", (unsigned) n+1, (unsigned) h, (unsigned) (n + 1 - h + 1))); +#if USE_DIVAPPR_Q + mpn_divappr_q (qp, tp, n + 1, sp + l, h, scratch); +#else + mpn_div_q (qp, tp, n + 1, sp + l, h, scratch); +#endif + q += qp [l + 1]; + c = 1; + if (q > 1) + { + /* FIXME: if s!=0 we will shift later, a noop on this area. */ + MPN_FILL (sp, l, GMP_NUMB_MAX); + } + else + { + /* FIXME: if s!=0 we will shift again later, shift just once. */ + mpn_rshift (sp, qp + 1, l, 1); + sp[l - 1] |= q << (GMP_NUMB_BITS - 1); + if (((qp[0] >> (2 + USE_DIVAPPR_Q)) | /* < 3 + 4*USE_DIVAPPR_Q */ + (qp[1] & (GMP_NUMB_MASK >> ((GMP_NUMB_BITS >> odd)- nsh - 1)))) == 0) + { + mp_limb_t cy; + /* Approximation is not good enough, the extra limb(+ nsh bits) + is smaller than needed to absorb the possible error. */ + /* {qp + 1, l + 1} equals 2*{sp, l} */ + /* FIXME: use mullo or wrap-around, or directly evaluate + remainder with a single sqrmod_bnm1. */ + TRACE(printf("mul(,,%u,,%u)\n", (unsigned) h, (unsigned) (l+1))); + ASSERT_NOCARRY (mpn_mul (scratch, sp + l, h, qp + 1, l + 1)); + /* Compute the remainder of the previous mpn_div(appr)_q. */ + cy = mpn_sub_n (tp + 1, tp + 1, scratch, h); +#if USE_DIVAPPR_Q || WANT_ASSERT + MPN_DECR_U (tp + 1 + h, l, cy); +#if USE_DIVAPPR_Q + ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) <= 0); + if (mpn_cmp (tp + 1 + h, scratch + h, l) < 0) + { + /* May happen only if div result was not exact. */ +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 || HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n_ip1 (tp + 1, sp + l, h); +#else + cy = mpn_addmul_1 (tp + 1, sp + l, h, CNST_LIMB(2)); +#endif + ASSERT_NOCARRY (mpn_add_1 (tp + 1 + h, tp + 1 + h, l, cy)); + MPN_DECR_U (sp, l, 1); + } + /* Can the root be exact when a correction was needed? We + did not find an example, but it depends on divappr + internals, and we can not assume it true in general...*/ + /* else */ +#else /* WANT_ASSERT */ + ASSERT (mpn_cmp (tp + 1 + h, scratch + h, l) == 0); +#endif +#endif + if (mpn_zero_p (tp + l + 1, h - l)) + { + TRACE(printf("sqr(,,%u)\n", (unsigned) l)); + mpn_sqr (scratch, sp, l); + c = mpn_cmp (tp + 1, scratch + l, l); + if (c == 0) + { + if (nsh != 0) + { + mpn_lshift (tp, np, l, 2 * nsh); + np = tp; + } + c = mpn_cmp (np, scratch + odd, l - odd); + } + if (c < 0) + { + MPN_DECR_U (sp, l, 1); + c = 1; + } + } + } + } + TMP_FREE; + + if ((odd | nsh) != 0) + mpn_rshift (sp, sp, n, nsh + (odd ? 
GMP_NUMB_BITS / 2 : 0)); + return c; +} + + +mp_size_t +mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn) +{ + mp_limb_t cc, high, rl; + int c; + mp_size_t rn, tn; + TMP_DECL; + + ASSERT (nn > 0); + ASSERT_MPN (np, nn); + + ASSERT (np[nn - 1] != 0); + ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn)); + ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn)); + ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn)); + + high = np[nn - 1]; + if (high & (GMP_NUMB_HIGHBIT | (GMP_NUMB_HIGHBIT / 2))) + c = 0; + else + { + count_leading_zeros (c, high); + c -= GMP_NAIL_BITS; + + c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */ + } + if (nn == 1) { + if (c == 0) + { + sp[0] = mpn_sqrtrem1 (&rl, high); + if (rp != NULL) + rp[0] = rl; + } + else + { + cc = mpn_sqrtrem1 (&rl, high << (2*c)) >> c; + sp[0] = cc; + if (rp != NULL) + rp[0] = rl = high - cc*cc; + } + return rl != 0; + } + if (nn == 2) { + mp_limb_t tp [2]; + if (rp == NULL) rp = tp; + if (c == 0) + { +#if SQRTREM2_INPLACE + rp[1] = high; + rp[0] = np[0]; + cc = CALL_SQRTREM2_INPLACE (sp, rp); +#else + cc = mpn_sqrtrem2 (sp, rp, np); +#endif + rp[1] = cc; + return ((rp[0] | cc) != 0) + cc; + } + else + { + rl = np[0]; + rp[1] = (high << (2*c)) | (rl >> (GMP_NUMB_BITS - 2*c)); + rp[0] = rl << (2*c); + CALL_SQRTREM2_INPLACE (sp, rp); + cc = sp[0] >>= c; /* c != 0, the highest bit of the root cc is 0. */ + rp[0] = rl -= cc*cc; /* Computed modulo 2^GMP_LIMB_BITS, because it's smaller. */ + return rl != 0; + } + } + tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */ + + if ((rp == NULL) && (nn > 8)) + return mpn_dc_sqrt (sp, np, tn, c, nn & 1); + TMP_MARK; + if (((nn & 1) | c) != 0) + { + mp_limb_t s0[1], mask; + mp_ptr tp, scratch; + TMP_ALLOC_LIMBS_2 (tp, 2 * tn, scratch, tn / 2 + 1); + tp[0] = 0; /* needed only when 2*tn > nn, but saves a test */ + if (c != 0) + mpn_lshift (tp + (nn & 1), np, nn, 2 * c); + else + MPN_COPY (tp + (nn & 1), np, nn); + c += (nn & 1) ? GMP_NUMB_BITS / 2 : 0; /* c now represents k */ + mask = (CNST_LIMB (1) << c) - 1; + rl = mpn_dc_sqrtrem (sp, tp, tn, (rp == NULL) ? mask - 1 : 0, scratch); + /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2, + thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */ + s0[0] = sp[0] & mask; /* S mod 2^k */ + rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]); /* R = R + 2*s0*S */ + cc = mpn_submul_1 (tp, s0, 1, s0[0]); + rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc; + mpn_rshift (sp, sp, tn, c); + tp[tn] = rl; + if (rp == NULL) + rp = tp; + c = c << 1; + if (c < GMP_NUMB_BITS) + tn++; + else + { + tp++; + c -= GMP_NUMB_BITS; + } + if (c != 0) + mpn_rshift (rp, tp, tn, c); + else + MPN_COPY_INCR (rp, tp, tn); + rn = tn; + } + else + { + if (rp != np) + { + if (rp == NULL) /* nn <= 8 */ + rp = TMP_SALLOC_LIMBS (nn); + MPN_COPY (rp, np, nn); + } + rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn, 0, TMP_ALLOC_LIMBS(tn / 2 + 1))); + } + + MPN_NORMALIZE (rp, rn); + + TMP_FREE; + return rn; +} diff --git a/gmp-6.3.0/mpn/generic/strongfibo.c b/gmp-6.3.0/mpn/generic/strongfibo.c new file mode 100644 index 0000000..7e8d612 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/strongfibo.c @@ -0,0 +1,219 @@ +/* mpn_fib2m -- calculate Fibonacci numbers, modulo m. + +Contributed to the GNU project by Marco Bodrato. + + THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST + CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN + FUTURE GNU MP RELEASES. 
+ +Copyright 2001, 2002, 2005, 2009, 2018, 2022 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include +#include "gmp-impl.h" + + +#if ! HAVE_NATIVE_mpn_rsblsh1_n && ! HAVE_NATIVE_mpn_sublsh1_n +/* Stores |{ap,n}-{bp,n}| in {rp,n}, + returns the sign of {ap,n}-{bp,n}. */ +static int +abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n) +{ + mp_limb_t x, y; + while (--n >= 0) + { + x = ap[n]; + y = bp[n]; + if (x != y) + { + ++n; + if (x > y) + { + ASSERT_NOCARRY (mpn_sub_n (rp, ap, bp, n)); + return 1; + } + else + { + ASSERT_NOCARRY (mpn_sub_n (rp, bp, ap, n)); + return -1; + } + } + rp[n] = 0; + } + return 0; +} +#endif + +/* Computes at most count terms of the sequence needed by the + Lucas-Lehmer-Riesel test, indexing backward: + L_i = L_{i+1}^2 - 2 + + The sequence is computed modulo M = {mp, mn}. + The starting point is given in L_{count+1} = {lp, mn}. + The scratch pointed by sp, needs a space of at least 3 * mn + 1 limbs. + + Returns the index i>0 if L_i = 0 (mod M) is found within the + computed count terms of the sequence. Otherwise it returns zero. + + Note: (+/-2)^2-2=2, (+/-1)^2-2=-1, 0^2-2=-2 + */ + +static mp_bitcnt_t +mpn_llriter (mp_ptr lp, mp_srcptr mp, mp_size_t mn, mp_bitcnt_t count, mp_ptr sp) +{ + do + { + mpn_sqr (sp, lp, mn); + mpn_tdiv_qr (sp + 2 * mn, lp, 0, sp, 2 * mn, mp, mn); + if (lp[0] < 5) + { + /* If L^2 % M < 5, |L^2 % M - 2| <= 2 */ + if (mn == 1 || mpn_zero_p (lp + 1, mn - 1)) + return (lp[0] == 2) ? count : 0; + else + MPN_DECR_U (lp, mn, 2); + } + else + lp[0] -= 2; + } while (--count != 0); + return 0; +} + +/* Store the Lucas' number L[n] at lp (maybe), computed modulo m. lp + and scratch should have room for mn*2+1 limbs. + + Returns the size of L[n] normally. + + If F[n] is zero modulo m, or L[n] is, returns 0 and lp is + undefined. +*/ + +static mp_size_t +mpn_lucm (mp_ptr lp, mp_srcptr np, mp_size_t nn, mp_srcptr mp, mp_size_t mn, mp_ptr scratch) +{ + int neg; + mp_limb_t cy; + + ASSERT (! MPN_OVERLAP_P (lp, MAX(2*mn+1,5), scratch, MAX(2*mn+1,5))); + ASSERT (nn > 0); + + neg = mpn_fib2m (lp, scratch, np, nn, mp, mn); + + /* F[n] = +/-{lp, mn}, F[n-1] = +/-{scratch, mn} */ + if (mpn_zero_p (lp, mn)) + return 0; + + if (neg) /* One sign is opposite, use sub instead of add. 
*/ + { +#if HAVE_NATIVE_mpn_rsblsh1_n || HAVE_NATIVE_mpn_sublsh1_n +#if HAVE_NATIVE_mpn_rsblsh1_n + cy = mpn_rsblsh1_n (lp, lp, scratch, mn); /* L[n] = +/-(2F[n-1]-(-F[n])) */ +#else + cy = mpn_sublsh1_n (lp, lp, scratch, mn); /* L[n] = -/+(F[n]-(-2F[n-1])) */ + if (cy != 0) + cy = mpn_add_n (lp, lp, mp, mn) - cy; +#endif + if (cy > 1) + cy += mpn_add_n (lp, lp, mp, mn); +#else + cy = mpn_lshift (scratch, scratch, mn, 1); /* 2F[n-1] */ + if (UNLIKELY (cy)) + cy -= mpn_sub_n (lp, scratch, lp, mn); /* L[n] = +/-(2F[n-1]-(-F[n])) */ + else + abs_sub_n (lp, lp, scratch, mn); +#endif + ASSERT (cy <= 1); + } + else + { +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (lp, lp, scratch, mn); /* L[n] = +/-(2F[n-1]+F[n])) */ +#else + cy = mpn_lshift (scratch, scratch, mn, 1); + cy+= mpn_add_n (lp, lp, scratch, mn); +#endif + ASSERT (cy <= 2); + } + while (cy || mpn_cmp (lp, mp, mn) >= 0) + cy -= mpn_sub_n (lp, lp, mp, mn); + MPN_NORMALIZE (lp, mn); + return mn; +} + +int +mpn_strongfibo (mp_srcptr mp, mp_size_t mn, mp_ptr scratch) +{ + mp_ptr lp, sp; + mp_size_t en; + mp_bitcnt_t b0; + TMP_DECL; + +#if GMP_NUMB_BITS % 4 == 0 + b0 = mpn_scan0 (mp, 0); +#else + { + mpz_t m = MPZ_ROINIT_N(mp, mn); + b0 = mpz_scan0 (m, 0); + } + if (UNLIKELY (b0 == mn * GMP_NUMB_BITS)) + { + en = 1; + scratch [0] = 1; + } + else +#endif + { + int cnt = b0 % GMP_NUMB_BITS; + en = b0 / GMP_NUMB_BITS; + if (LIKELY (cnt != 0)) + mpn_rshift (scratch, mp + en, mn - en, cnt); + else + MPN_COPY (scratch, mp + en, mn - en); + en = mn - en; + scratch [0] |= 1; + en -= scratch [en - 1] == 0; + } + TMP_MARK; + + lp = TMP_ALLOC_LIMBS (4 * mn + 6); + sp = lp + 2 * mn + 3; + en = mpn_lucm (sp, scratch, en, mp, mn, lp); + if (en != 0 && LIKELY (--b0 != 0)) + { + mpn_sqr (lp, sp, en); + lp [0] |= 2; /* V^2 + 2 */ + if (LIKELY (2 * en >= mn)) + mpn_tdiv_qr (sp, lp, 0, lp, 2 * en, mp, mn); + else + MPN_ZERO (lp + 2 * en, mn - 2 * en); + if (! mpn_zero_p (lp, mn) && LIKELY (--b0 != 0)) + b0 = mpn_llriter (lp, mp, mn, b0, lp + mn + 1); + } + TMP_FREE; + return (b0 != 0); +} diff --git a/gmp-6.3.0/mpn/generic/sub.c b/gmp-6.3.0/mpn/generic/sub.c new file mode 100644 index 0000000..df0afd6 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sub.c @@ -0,0 +1,33 @@ +/* mpn_sub - subtract mpn from mpn. + +Copyright 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#define __GMP_FORCE_mpn_sub 1 + +#include "gmp-impl.h" diff --git a/gmp-6.3.0/mpn/generic/sub_1.c b/gmp-6.3.0/mpn/generic/sub_1.c new file mode 100644 index 0000000..a20f191 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sub_1.c @@ -0,0 +1,33 @@ +/* mpn_sub_1 - subtract limb from mpn. + +Copyright 2001 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define __GMP_FORCE_mpn_sub_1 1 + +#include "gmp-impl.h" diff --git a/gmp-6.3.0/mpn/generic/sub_err1_n.c b/gmp-6.3.0/mpn/generic/sub_err1_n.c new file mode 100644 index 0000000..beca57e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sub_err1_n.c @@ -0,0 +1,100 @@ +/* mpn_sub_err1_n -- sub_n with one error term + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* + Computes: + + (1) {rp,n} := {up,n} - {vp,n} (just like mpn_sub_n) with incoming borrow cy, + return value is borrow out. + + (2) Let c[i+1] = borrow from i-th limb subtraction (c[0] = cy). + Computes c[1]*yp[n-1] + ... + c[n]*yp[0], stores two-limb result at ep. + + Requires n >= 1. + + None of the outputs may overlap each other or any of the inputs, except + that {rp,n} may be equal to {up,n} or {vp,n}. 
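+   A small worked instance of (2), with made-up values: for n = 2 and
+   cy = 0, if the limb-0 subtraction borrows (c[1] = 1) and the limb-1
+   subtraction does not (c[2] = 0), then {ep,2} receives
+   c[1]*yp[1] + c[2]*yp[0] = yp[1], and the return value is c[2] = 0.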
+*/ +mp_limb_t +mpn_sub_err1_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, + mp_ptr ep, mp_srcptr yp, + mp_size_t n, mp_limb_t cy) +{ + mp_limb_t el, eh, ul, vl, yl, zl, rl, sl, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, up, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, vp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, yp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 2, rp, n)); + + yp += n - 1; + el = eh = 0; + + do + { + yl = *yp--; + ul = *up++; + vl = *vp++; + + /* ordinary sub_n */ + SUBC_LIMB (cy1, sl, ul, vl); + SUBC_LIMB (cy2, rl, sl, cy); + cy = cy1 | cy2; + *rp++ = rl; + + /* update (eh:el) */ + zl = (-cy) & yl; + el += zl; + eh += el < zl; + } + while (--n); + +#if GMP_NAIL_BITS != 0 + eh = (eh << GMP_NAIL_BITS) + (el >> GMP_NUMB_BITS); + el &= GMP_NUMB_MASK; +#endif + + ep[0] = el; + ep[1] = eh; + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/sub_err2_n.c b/gmp-6.3.0/mpn/generic/sub_err2_n.c new file mode 100644 index 0000000..1edf8d6 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sub_err2_n.c @@ -0,0 +1,116 @@ +/* mpn_sub_err2_n -- sub_n with two error terms + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* + Computes: + + (1) {rp,n} := {up,n} - {vp,n} (just like mpn_sub_n) with incoming borrow cy, + return value is borrow out. + + (2) Let c[i+1] = borrow from i-th limb subtraction (c[0] = cy). + Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0], + c[1]*yp2[n-1] + ... + c[n]*yp2[0], + stores two-limb results at {ep,2} and {ep+2,2} respectively. + + Requires n >= 1. + + None of the outputs may overlap each other or any of the inputs, except + that {rp,n} may be equal to {up,n} or {vp,n}. +*/ +mp_limb_t +mpn_sub_err2_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, + mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2, + mp_size_t n, mp_limb_t cy) +{ + mp_limb_t el1, eh1, el2, eh2, ul, vl, yl1, yl2, zl1, zl2, rl, sl, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, up, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, vp, n)); + ASSERT (! 
MPN_OVERLAP_P (ep, 4, yp1, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, yp2, n)); + ASSERT (! MPN_OVERLAP_P (ep, 4, rp, n)); + + yp1 += n - 1; + yp2 += n - 1; + el1 = eh1 = 0; + el2 = eh2 = 0; + + do + { + yl1 = *yp1--; + yl2 = *yp2--; + ul = *up++; + vl = *vp++; + + /* ordinary sub_n */ + SUBC_LIMB (cy1, sl, ul, vl); + SUBC_LIMB (cy2, rl, sl, cy); + cy = cy1 | cy2; + *rp++ = rl; + + /* update (eh1:el1) */ + zl1 = (-cy) & yl1; + el1 += zl1; + eh1 += el1 < zl1; + + /* update (eh2:el2) */ + zl2 = (-cy) & yl2; + el2 += zl2; + eh2 += el2 < zl2; + } + while (--n); + +#if GMP_NAIL_BITS != 0 + eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS); + el1 &= GMP_NUMB_MASK; + eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS); + el2 &= GMP_NUMB_MASK; +#endif + + ep[0] = el1; + ep[1] = eh1; + ep[2] = el2; + ep[3] = eh2; + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/sub_err3_n.c b/gmp-6.3.0/mpn/generic/sub_err3_n.c new file mode 100644 index 0000000..2db3c63 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sub_err3_n.c @@ -0,0 +1,131 @@ +/* mpn_sub_err3_n -- sub_n with three error terms + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* + Computes: + + (1) {rp,n} := {up,n} - {vp,n} (just like mpn_sub_n) with incoming borrow cy, + return value is borrow out. + + (2) Let c[i+1] = borrow from i-th limb subtraction (c[0] = cy). + Computes c[1]*yp1[n-1] + ... + c[n]*yp1[0], + c[1]*yp2[n-1] + ... + c[n]*yp2[0], + c[1]*yp3[n-1] + ... + c[n]*yp3[0], + stores two-limb results at {ep,2}, {ep+2,2} and {ep+4,2} respectively. + + Requires n >= 1. + + None of the outputs may overlap each other or any of the inputs, except + that {rp,n} may be equal to {up,n} or {vp,n}. +*/ +mp_limb_t +mpn_sub_err3_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, + mp_ptr ep, mp_srcptr yp1, mp_srcptr yp2, mp_srcptr yp3, + mp_size_t n, mp_limb_t cy) +{ + mp_limb_t el1, eh1, el2, eh2, el3, eh3, ul, vl, yl1, yl2, yl3, zl1, zl2, zl3, rl, sl, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, vp, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp1, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp2, n)); + ASSERT (! MPN_OVERLAP_P (rp, n, yp3, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, up, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, vp, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, yp1, n)); + ASSERT (! 
MPN_OVERLAP_P (ep, 6, yp2, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, yp3, n)); + ASSERT (! MPN_OVERLAP_P (ep, 6, rp, n)); + + yp1 += n - 1; + yp2 += n - 1; + yp3 += n - 1; + el1 = eh1 = 0; + el2 = eh2 = 0; + el3 = eh3 = 0; + + do + { + yl1 = *yp1--; + yl2 = *yp2--; + yl3 = *yp3--; + ul = *up++; + vl = *vp++; + + /* ordinary sub_n */ + SUBC_LIMB (cy1, sl, ul, vl); + SUBC_LIMB (cy2, rl, sl, cy); + cy = cy1 | cy2; + *rp++ = rl; + + /* update (eh1:el1) */ + zl1 = (-cy) & yl1; + el1 += zl1; + eh1 += el1 < zl1; + + /* update (eh2:el2) */ + zl2 = (-cy) & yl2; + el2 += zl2; + eh2 += el2 < zl2; + + /* update (eh3:el3) */ + zl3 = (-cy) & yl3; + el3 += zl3; + eh3 += el3 < zl3; + } + while (--n); + +#if GMP_NAIL_BITS != 0 + eh1 = (eh1 << GMP_NAIL_BITS) + (el1 >> GMP_NUMB_BITS); + el1 &= GMP_NUMB_MASK; + eh2 = (eh2 << GMP_NAIL_BITS) + (el2 >> GMP_NUMB_BITS); + el2 &= GMP_NUMB_MASK; + eh3 = (eh3 << GMP_NAIL_BITS) + (el3 >> GMP_NUMB_BITS); + el3 &= GMP_NUMB_MASK; +#endif + + ep[0] = el1; + ep[1] = eh1; + ep[2] = el2; + ep[3] = eh2; + ep[4] = el3; + ep[5] = eh3; + + return cy; +} diff --git a/gmp-6.3.0/mpn/generic/sub_n.c b/gmp-6.3.0/mpn/generic/sub_n.c new file mode 100644 index 0000000..b192c96 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/sub_n.c @@ -0,0 +1,89 @@ +/* mpn_sub_n -- Subtract equal length limb vectors. + +Copyright 1992-1994, 1996, 2000, 2002, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + + +#if GMP_NAIL_BITS == 0 + +mp_limb_t +mpn_sub_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t ul, vl, sl, rl, cy, cy1, cy2; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_INCR_P (rp, up, n)); + ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n)); + + cy = 0; + do + { + ul = *up++; + vl = *vp++; + sl = ul - vl; + cy1 = sl > ul; + rl = sl - cy; + cy2 = rl > sl; + cy = cy1 | cy2; + *rp++ = rl; + } + while (--n != 0); + + return cy; +} + +#endif + +#if GMP_NAIL_BITS >= 1 + +mp_limb_t +mpn_sub_n (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n) +{ + mp_limb_t ul, vl, rl, cy; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_INCR_P (rp, up, n)); + ASSERT (MPN_SAME_OR_INCR_P (rp, vp, n)); + + cy = 0; + do + { + ul = *up++; + vl = *vp++; + rl = ul - vl - cy; + cy = rl >> (GMP_LIMB_BITS - 1); + *rp++ = rl & GMP_NUMB_MASK; + } + while (--n != 0); + + return cy; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/submul_1.c b/gmp-6.3.0/mpn/generic/submul_1.c new file mode 100644 index 0000000..4744274 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/submul_1.c @@ -0,0 +1,144 @@ +/* mpn_submul_1 -- multiply the N long limb vector pointed to by UP by VL, + subtract the N least significant limbs of the product from the limb + vector pointed to by RP. Return the most significant limb of the + product, adjusted for carry-out from the subtraction. + +Copyright 1992-1994, 1996, 2000, 2002, 2004 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +#if GMP_NAIL_BITS == 0 + +mp_limb_t +mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0) +{ + mp_limb_t u0, crec, c, p1, p0, r0; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + + crec = 0; + do + { + u0 = *up++; + umul_ppmm (p1, p0, u0, v0); + + r0 = *rp; + + p0 = r0 - p0; + c = r0 < p0; + + p1 = p1 + c; + + r0 = p0 - crec; /* cycle 0, 3, ... */ + c = p0 < r0; /* cycle 1, 4, ... */ + + crec = p1 + c; /* cycle 2, 5, ... 
*/ + + *rp++ = r0; + } + while (--n != 0); + + return crec; +} + +#endif + +#if GMP_NAIL_BITS == 1 + +mp_limb_t +mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0) +{ + mp_limb_t shifted_v0, u0, r0, p0, p1, prev_p1, cl, xl, c1, c2, c3; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT_MPN (rp, n); + ASSERT_MPN (up, n); + ASSERT_LIMB (v0); + + shifted_v0 = v0 << GMP_NAIL_BITS; + cl = 0; + prev_p1 = 0; + do + { + u0 = *up++; + r0 = *rp; + umul_ppmm (p1, p0, u0, shifted_v0); + p0 >>= GMP_NAIL_BITS; + SUBC_LIMB (c1, xl, r0, prev_p1); + SUBC_LIMB (c2, xl, xl, p0); + SUBC_LIMB (c3, xl, xl, cl); + cl = c1 + c2 + c3; + *rp++ = xl; + prev_p1 = p1; + } + while (--n != 0); + + return prev_p1 + cl; +} + +#endif + +#if GMP_NAIL_BITS >= 2 + +mp_limb_t +mpn_submul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0) +{ + mp_limb_t shifted_v0, u0, r0, p0, p1, prev_p1, xw, cl, xl; + + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, n)); + ASSERT_MPN (rp, n); + ASSERT_MPN (up, n); + ASSERT_LIMB (v0); + + shifted_v0 = v0 << GMP_NAIL_BITS; + cl = 0; + prev_p1 = 0; + do + { + u0 = *up++; + r0 = *rp; + umul_ppmm (p1, p0, u0, shifted_v0); + p0 >>= GMP_NAIL_BITS; + xw = r0 - (prev_p1 + p0) + cl; + cl = (mp_limb_signed_t) xw >> GMP_NUMB_BITS; /* FIXME: non-portable */ + xl = xw & GMP_NUMB_MASK; + *rp++ = xl; + prev_p1 = p1; + } + while (--n != 0); + + return prev_p1 - cl; +} + +#endif diff --git a/gmp-6.3.0/mpn/generic/tdiv_qr.c b/gmp-6.3.0/mpn/generic/tdiv_qr.c new file mode 100644 index 0000000..92ff33c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/tdiv_qr.c @@ -0,0 +1,386 @@ +/* mpn_tdiv_qr -- Divide the numerator (np,nn) by the denominator (dp,dn) and + write the nn-dn+1 quotient limbs at qp and the dn remainder limbs at rp. If + qxn is non-zero, generate that many fraction limbs and append them after the + other quotient limbs, and update the remainder accordingly. The input + operands are unaffected. + + Preconditions: + 1. The most significant limb of the divisor must be non-zero. + 2. nn >= dn, even if qxn is non-zero. (??? relax this ???) + + The time complexity of this is O(qn*qn+M(dn,qn)), where M(m,n) is the time + complexity of multiplication. + +Copyright 1997, 2000-2002, 2005, 2009, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + + +void +mpn_tdiv_qr (mp_ptr qp, mp_ptr rp, mp_size_t qxn, + mp_srcptr np, mp_size_t nn, mp_srcptr dp, mp_size_t dn) +{ + ASSERT_ALWAYS (qxn == 0); + + ASSERT (nn >= 0); + ASSERT (dn >= 0); + ASSERT (dn == 0 || dp[dn - 1] != 0); + ASSERT (! 
MPN_OVERLAP_P (qp, nn - dn + 1 + qxn, np, nn)); + ASSERT (! MPN_OVERLAP_P (qp, nn - dn + 1 + qxn, dp, dn)); + + switch (dn) + { + case 0: + DIVIDE_BY_ZERO; + + case 1: + { + rp[0] = mpn_divrem_1 (qp, (mp_size_t) 0, np, nn, dp[0]); + return; + } + + case 2: + { + mp_ptr n2p; + mp_limb_t qhl, cy; + TMP_DECL; + TMP_MARK; + if ((dp[1] & GMP_NUMB_HIGHBIT) == 0) + { + int cnt; + mp_limb_t d2p[2]; + count_leading_zeros (cnt, dp[1]); + cnt -= GMP_NAIL_BITS; + d2p[1] = (dp[1] << cnt) | (dp[0] >> (GMP_NUMB_BITS - cnt)); + d2p[0] = (dp[0] << cnt) & GMP_NUMB_MASK; + n2p = TMP_ALLOC_LIMBS (nn + 1); + cy = mpn_lshift (n2p, np, nn, cnt); + n2p[nn] = cy; + qhl = mpn_divrem_2 (qp, 0L, n2p, nn + (cy != 0), d2p); + if (cy == 0) + qp[nn - 2] = qhl; /* always store nn-2+1 quotient limbs */ + rp[0] = (n2p[0] >> cnt) + | ((n2p[1] << (GMP_NUMB_BITS - cnt)) & GMP_NUMB_MASK); + rp[1] = (n2p[1] >> cnt); + } + else + { + n2p = TMP_ALLOC_LIMBS (nn); + MPN_COPY (n2p, np, nn); + qhl = mpn_divrem_2 (qp, 0L, n2p, nn, dp); + qp[nn - 2] = qhl; /* always store nn-2+1 quotient limbs */ + rp[0] = n2p[0]; + rp[1] = n2p[1]; + } + TMP_FREE; + return; + } + + default: + { + int adjust; + gmp_pi1_t dinv; + TMP_DECL; + TMP_MARK; + adjust = np[nn - 1] >= dp[dn - 1]; /* conservative tests for quotient size */ + if (nn + adjust >= 2 * dn) + { + mp_ptr n2p, d2p; + mp_limb_t cy; + int cnt; + + qp[nn - dn] = 0; /* zero high quotient limb */ + if ((dp[dn - 1] & GMP_NUMB_HIGHBIT) == 0) /* normalize divisor */ + { + count_leading_zeros (cnt, dp[dn - 1]); + cnt -= GMP_NAIL_BITS; + d2p = TMP_ALLOC_LIMBS (dn); + mpn_lshift (d2p, dp, dn, cnt); + n2p = TMP_ALLOC_LIMBS (nn + 1); + cy = mpn_lshift (n2p, np, nn, cnt); + n2p[nn] = cy; + nn += adjust; + } + else + { + cnt = 0; + d2p = (mp_ptr) dp; + n2p = TMP_ALLOC_LIMBS (nn + 1); + MPN_COPY (n2p, np, nn); + n2p[nn] = 0; + nn += adjust; + } + + invert_pi1 (dinv, d2p[dn - 1], d2p[dn - 2]); + if (BELOW_THRESHOLD (dn, DC_DIV_QR_THRESHOLD)) + mpn_sbpi1_div_qr (qp, n2p, nn, d2p, dn, dinv.inv32); + else if (BELOW_THRESHOLD (dn, MUPI_DIV_QR_THRESHOLD) || /* fast condition */ + BELOW_THRESHOLD (nn, 2 * MU_DIV_QR_THRESHOLD) || /* fast condition */ + (double) (2 * (MU_DIV_QR_THRESHOLD - MUPI_DIV_QR_THRESHOLD)) * dn /* slow... */ + + (double) MUPI_DIV_QR_THRESHOLD * nn > (double) dn * nn) /* ...condition */ + mpn_dcpi1_div_qr (qp, n2p, nn, d2p, dn, &dinv); + else + { + mp_size_t itch = mpn_mu_div_qr_itch (nn, dn, 0); + mp_ptr scratch = TMP_ALLOC_LIMBS (itch); + mpn_mu_div_qr (qp, rp, n2p, nn, d2p, dn, scratch); + n2p = rp; + } + + if (cnt != 0) + mpn_rshift (rp, n2p, dn, cnt); + else + MPN_COPY (rp, n2p, dn); + TMP_FREE; + return; + } + + /* When we come here, the numerator/partial remainder is less + than twice the size of the denominator. */ + + { + /* Problem: + + Divide a numerator N with nn limbs by a denominator D with dn + limbs forming a quotient of qn=nn-dn+1 limbs. When qn is small + compared to dn, conventional division algorithms perform poorly. + We want an algorithm that has an expected running time that is + dependent only on qn. + + Algorithm (very informally stated): + + 1) Divide the 2 x qn most significant limbs from the numerator + by the qn most significant limbs from the denominator. Call + the result qest. This is either the correct quotient, but + might be 1 or 2 too large. Compute the remainder from the + division. (This step is implemented by an mpn_divrem call.) 
+ + 2) Is the most significant limb from the remainder < p, where p + is the product of the most significant limb from the quotient + and the next(d)? (Next(d) denotes the next ignored limb from + the denominator.) If it is, decrement qest, and adjust the + remainder accordingly. + + 3) Is the remainder >= qest? If it is, qest is the desired + quotient. The algorithm terminates. + + 4) Subtract qest x next(d) from the remainder. If there is + borrow out, decrement qest, and adjust the remainder + accordingly. + + 5) Skip one word from the denominator (i.e., let next(d) denote + the next less significant limb. */ + + mp_size_t qn; + mp_ptr n2p, d2p; + mp_ptr tp; + mp_limb_t cy; + mp_size_t in, rn; + mp_limb_t quotient_too_large; + unsigned int cnt; + + qn = nn - dn; + qp[qn] = 0; /* zero high quotient limb */ + qn += adjust; /* qn cannot become bigger */ + + if (qn == 0) + { + MPN_COPY (rp, np, dn); + TMP_FREE; + return; + } + + in = dn - qn; /* (at least partially) ignored # of limbs in ops */ + /* Normalize denominator by shifting it to the left such that its + most significant bit is set. Then shift the numerator the same + amount, to mathematically preserve quotient. */ + if ((dp[dn - 1] & GMP_NUMB_HIGHBIT) == 0) + { + count_leading_zeros (cnt, dp[dn - 1]); + cnt -= GMP_NAIL_BITS; + + d2p = TMP_ALLOC_LIMBS (qn); + mpn_lshift (d2p, dp + in, qn, cnt); + d2p[0] |= dp[in - 1] >> (GMP_NUMB_BITS - cnt); + + n2p = TMP_ALLOC_LIMBS (2 * qn + 1); + cy = mpn_lshift (n2p, np + nn - 2 * qn, 2 * qn, cnt); + if (adjust) + { + n2p[2 * qn] = cy; + n2p++; + } + else + { + n2p[0] |= np[nn - 2 * qn - 1] >> (GMP_NUMB_BITS - cnt); + } + } + else + { + cnt = 0; + d2p = (mp_ptr) dp + in; + + n2p = TMP_ALLOC_LIMBS (2 * qn + 1); + MPN_COPY (n2p, np + nn - 2 * qn, 2 * qn); + if (adjust) + { + n2p[2 * qn] = 0; + n2p++; + } + } + + /* Get an approximate quotient using the extracted operands. */ + if (qn == 1) + { + mp_limb_t q0, r0; + udiv_qrnnd (q0, r0, n2p[1], n2p[0] << GMP_NAIL_BITS, d2p[0] << GMP_NAIL_BITS); + n2p[0] = r0 >> GMP_NAIL_BITS; + qp[0] = q0; + } + else if (qn == 2) + mpn_divrem_2 (qp, 0L, n2p, 4L, d2p); /* FIXME: obsolete function */ + else + { + invert_pi1 (dinv, d2p[qn - 1], d2p[qn - 2]); + if (BELOW_THRESHOLD (qn, DC_DIV_QR_THRESHOLD)) + mpn_sbpi1_div_qr (qp, n2p, 2 * qn, d2p, qn, dinv.inv32); + else if (BELOW_THRESHOLD (qn, MU_DIV_QR_THRESHOLD)) + mpn_dcpi1_div_qr (qp, n2p, 2 * qn, d2p, qn, &dinv); + else + { + mp_size_t itch = mpn_mu_div_qr_itch (2 * qn, qn, 0); + mp_ptr scratch = TMP_ALLOC_LIMBS (itch); + mp_ptr r2p = rp; + if (np == r2p) /* If N and R share space, put ... */ + r2p += nn - qn; /* intermediate remainder at N's upper end. */ + mpn_mu_div_qr (qp, r2p, n2p, 2 * qn, d2p, qn, scratch); + MPN_COPY (n2p, r2p, qn); + } + } + + rn = qn; + /* Multiply the first ignored divisor limb by the most significant + quotient limb. If that product is > the partial remainder's + most significant limb, we know the quotient is too large. This + test quickly catches most cases where the quotient is too large; + it catches all cases where the quotient is 2 too large. 
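+	     Informally: if the top limb of the partial remainder is already
+	     smaller than the high limb of qp[qn-1] times that divisor limb,
+	     the estimate must exceed the true quotient, so it is decremented
+	     and the divisor added back once.  Any remaining excess of one is
+	     caught below via quotient_too_large, after the product of the
+	     quotient and the ignored divisor limbs has been subtracted.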
*/ + { + mp_limb_t dl, x; + mp_limb_t h, dummy; + + if (in - 2 < 0) + dl = 0; + else + dl = dp[in - 2]; + +#if GMP_NAIL_BITS == 0 + x = (dp[in - 1] << cnt) | ((dl >> 1) >> ((~cnt) % GMP_LIMB_BITS)); +#else + x = (dp[in - 1] << cnt) & GMP_NUMB_MASK; + if (cnt != 0) + x |= dl >> (GMP_NUMB_BITS - cnt); +#endif + umul_ppmm (h, dummy, x, qp[qn - 1] << GMP_NAIL_BITS); + + if (n2p[qn - 1] < h) + { + mp_limb_t cy; + + mpn_decr_u (qp, (mp_limb_t) 1); + cy = mpn_add_n (n2p, n2p, d2p, qn); + if (cy) + { + /* The partial remainder is safely large. */ + n2p[qn] = cy; + ++rn; + } + } + } + + quotient_too_large = 0; + if (cnt != 0) + { + mp_limb_t cy1, cy2; + + /* Append partially used numerator limb to partial remainder. */ + cy1 = mpn_lshift (n2p, n2p, rn, GMP_NUMB_BITS - cnt); + n2p[0] |= np[in - 1] & (GMP_NUMB_MASK >> cnt); + + /* Update partial remainder with partially used divisor limb. */ + cy2 = mpn_submul_1 (n2p, qp, qn, dp[in - 1] & (GMP_NUMB_MASK >> cnt)); + if (qn != rn) + { + ASSERT_ALWAYS (n2p[qn] >= cy2); + n2p[qn] -= cy2; + } + else + { + n2p[qn] = cy1 - cy2; /* & GMP_NUMB_MASK; */ + + quotient_too_large = (cy1 < cy2); + ++rn; + } + --in; + } + /* True: partial remainder now is neutral, i.e., it is not shifted up. */ + + tp = TMP_ALLOC_LIMBS (dn); + + if (in < qn) + { + if (in == 0) + { + MPN_COPY (rp, n2p, rn); + ASSERT_ALWAYS (rn == dn); + goto foo; + } + mpn_mul (tp, qp, qn, dp, in); + } + else + mpn_mul (tp, dp, in, qp, qn); + + cy = mpn_sub (n2p, n2p, rn, tp + in, qn); + MPN_COPY (rp + in, n2p, dn - in); + quotient_too_large |= cy; + cy = mpn_sub_n (rp, np, tp, in); + cy = mpn_sub_1 (rp + in, rp + in, rn, cy); + quotient_too_large |= cy; + foo: + if (quotient_too_large) + { + mpn_decr_u (qp, (mp_limb_t) 1); + mpn_add_n (rp, rp, dp, dn); + } + } + TMP_FREE; + return; + } + } +} diff --git a/gmp-6.3.0/mpn/generic/toom22_mul.c b/gmp-6.3.0/mpn/generic/toom22_mul.c new file mode 100644 index 0000000..da56014 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom22_mul.c @@ -0,0 +1,222 @@ +/* mpn_toom22_mul -- Multiply {ap,an} and {bp,bn} where an >= bn. Or more + accurately, bn <= an < 2bn. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2010, 2012, 2014, 2018, 2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" + +/* Evaluate in: -1, 0, +inf + + <-s--><--n--> + ____ ______ + |_a1_|___a0_| + |b1_|___b0_| + <-t-><--n--> + + v0 = a0 * b0 # A(0)*B(0) + vm1 = (a0- a1)*(b0- b1) # A(-1)*B(-1) + vinf= a1 * b1 # A(inf)*B(inf) +*/ + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_mul_toom22 1 +#else +#define MAYBE_mul_toom22 \ + (MUL_TOOM33_THRESHOLD >= 2 * MUL_TOOM22_THRESHOLD) +#endif + +#define TOOM22_MUL_N_REC(p, a, b, n, ws) \ + do { \ + if (! MAYBE_mul_toom22 \ + || BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \ + mpn_mul_basecase (p, a, n, b, n); \ + else \ + mpn_toom22_mul (p, a, n, b, n, ws); \ + } while (0) + +/* Normally, this calls mul_basecase or toom22_mul. But when when the fraction + MUL_TOOM33_THRESHOLD / MUL_TOOM22_THRESHOLD is large, an initially small + relative unbalance will become a larger and larger relative unbalance with + each recursion (the difference s-t will be invariant over recursive calls). + Therefore, we need to call toom32_mul. FIXME: Suppress depending on + MUL_TOOM33_THRESHOLD / MUL_TOOM22_THRESHOLD and on MUL_TOOM22_THRESHOLD. */ +#define TOOM22_MUL_REC(p, a, an, b, bn, ws) \ + do { \ + if (! MAYBE_mul_toom22 \ + || BELOW_THRESHOLD (bn, MUL_TOOM22_THRESHOLD)) \ + mpn_mul_basecase (p, a, an, b, bn); \ + else if (4 * an < 5 * bn) \ + mpn_toom22_mul (p, a, an, b, bn, ws); \ + else \ + mpn_toom32_mul (p, a, an, b, bn, ws); \ + } while (0) + +void +mpn_toom22_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr scratch) +{ + const int __gmpn_cpuvec_initialized = 1; + mp_size_t n, s, t; + int vm1_neg; + mp_limb_t cy, cy2; + mp_ptr asm1; + mp_ptr bsm1; + +#define a0 ap +#define a1 (ap + n) +#define b0 bp +#define b1 (bp + n) + + s = an >> 1; + n = an - s; + t = bn - n; + + ASSERT (an >= bn); + + ASSERT (0 < s && s <= n && (n - s) == (an & 1)); + ASSERT (0 < t && t <= s); + + asm1 = pp; + bsm1 = pp + n; + + vm1_neg = 0; + + /* Compute asm1. */ + if ((an & 1) == 0) /* s == n */ + { + if (mpn_cmp (a0, a1, n) < 0) + { + mpn_sub_n (asm1, a1, a0, n); + vm1_neg = 1; + } + else + { + mpn_sub_n (asm1, a0, a1, n); + } + } + else /* n - s == 1 */ + { + if (a0[s] == 0 && mpn_cmp (a0, a1, s) < 0) + { + mpn_sub_n (asm1, a1, a0, s); + asm1[s] = 0; + vm1_neg = 1; + } + else + { + asm1[s] = a0[s] - mpn_sub_n (asm1, a0, a1, s); + } + } + + /* Compute bsm1. 
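+     As with asm1, only |b0 - b1| is stored here; the sign of the product
+     A(-1)*B(-1) is folded into vm1_neg.  The recombination further down
+     relies on the identity
+       a1*b0 + a0*b1 = a0*b0 + a1*b1 - (a0 - a1)*(b0 - b1) = v0 + vinf - vm1
+     (vm1 taken with its sign), so the full product is vinf shifted up by
+     2n limbs, plus this middle term shifted up by n limbs, plus v0.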
*/ + if (t == n) + { + if (mpn_cmp (b0, b1, n) < 0) + { + mpn_sub_n (bsm1, b1, b0, n); + vm1_neg ^= 1; + } + else + { + mpn_sub_n (bsm1, b0, b1, n); + } + } + else + { + if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) + { + mpn_sub_n (bsm1, b1, b0, t); + MPN_ZERO (bsm1 + t, n - t); + vm1_neg ^= 1; + } + else + { + mpn_sub (bsm1, b0, n, b1, t); + } + } + +#define v0 pp /* 2n */ +#define vinf (pp + 2 * n) /* s+t */ +#define vm1 scratch /* 2n */ +#define scratch_out scratch + 2 * n + + /* vm1, 2n limbs */ + TOOM22_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out); + + if (s > t) TOOM22_MUL_REC (vinf, a1, s, b1, t, scratch_out); + else TOOM22_MUL_N_REC (vinf, a1, b1, s, scratch_out); + + /* v0, 2n limbs */ + TOOM22_MUL_N_REC (v0, ap, bp, n, scratch_out); + + /* H(v0) + L(vinf) */ + cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n); + + /* L(v0) + (H(v0) + L(vinf)) */ + cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n); + + /* (H(v0) + L(vinf)) + H(vinf) */ + cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + t - n); + + if (vm1_neg) + cy += mpn_add_n (pp + n, pp + n, vm1, 2 * n); + else { + cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n); + if (UNLIKELY (cy + 1 == 0)) { /* cy is negative */ + /* The total contribution of v0+vinf-vm1 can not be negative. */ +#if WANT_ASSERT + /* The borrow in cy stops the propagation of the carry cy2, */ + ASSERT (cy2 == 1); + cy += mpn_add_1 (pp + 2 * n, pp + 2 * n, n, cy2); + ASSERT (cy == 0); +#else + /* we simply fill the area with zeros. */ + MPN_FILL (pp + 2 * n, n, 0); + /* ASSERT (s + t == n || mpn_zero_p (pp + 3 * n, s + t - n)); */ +#endif + return; + } + } + + ASSERT (cy <= 2); + ASSERT (cy2 <= 2); + + MPN_INCR_U (pp + 2 * n, s + t, cy2); + /* if s+t==n, cy is zero, but we should not access pp[3*n] at all. */ + MPN_INCR_U (pp + 3 * n, s + t - n, cy); +} diff --git a/gmp-6.3.0/mpn/generic/toom2_sqr.c b/gmp-6.3.0/mpn/generic/toom2_sqr.c new file mode 100644 index 0000000..db7a846 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom2_sqr.c @@ -0,0 +1,155 @@ +/* mpn_toom2_sqr -- Square {ap,an}. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2010, 2012, 2014, 2018, 2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" + +/* Evaluate in: -1, 0, +inf + + <-s--><--n--> + ____ ______ + |_a1_|___a0_| + + v0 = a0 ^2 # A(0)^2 + vm1 = (a0- a1)^2 # A(-1)^2 + vinf= a1 ^2 # A(inf)^2 +*/ + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_sqr_toom2 1 +#else +#define MAYBE_sqr_toom2 \ + (SQR_TOOM3_THRESHOLD >= 2 * SQR_TOOM2_THRESHOLD) +#endif + +#define TOOM2_SQR_REC(p, a, n, ws) \ + do { \ + if (! MAYBE_sqr_toom2 \ + || BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) \ + mpn_sqr_basecase (p, a, n); \ + else \ + mpn_toom2_sqr (p, a, n, ws); \ + } while (0) + +void +mpn_toom2_sqr (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_ptr scratch) +{ + const int __gmpn_cpuvec_initialized = 1; + mp_size_t n, s; + mp_limb_t cy, cy2; + mp_ptr asm1; + +#define a0 ap +#define a1 (ap + n) + + s = an >> 1; + n = an - s; + + ASSERT (0 < s && s <= n && (n - s) == (an & 1)); + + asm1 = pp; + + /* Compute asm1. */ + if ((an & 1) == 0) /* s == n */ + { + if (mpn_cmp (a0, a1, n) < 0) + { + mpn_sub_n (asm1, a1, a0, n); + } + else + { + mpn_sub_n (asm1, a0, a1, n); + } + } + else /* n - s == 1 */ + { + if (a0[s] == 0 && mpn_cmp (a0, a1, s) < 0) + { + mpn_sub_n (asm1, a1, a0, s); + asm1[s] = 0; + } + else + { + asm1[s] = a0[s] - mpn_sub_n (asm1, a0, a1, s); + } + } + +#define v0 pp /* 2n */ +#define vinf (pp + 2 * n) /* s+s */ +#define vm1 scratch /* 2n */ +#define scratch_out scratch + 2 * n + + /* vm1, 2n limbs */ + TOOM2_SQR_REC (vm1, asm1, n, scratch_out); + + /* vinf, s+s limbs */ + TOOM2_SQR_REC (vinf, a1, s, scratch_out); + + /* v0, 2n limbs */ + TOOM2_SQR_REC (v0, ap, n, scratch_out); + + /* H(v0) + L(vinf) */ + cy = mpn_add_n (pp + 2 * n, v0 + n, vinf, n); + + /* L(v0) + H(v0) */ + cy2 = cy + mpn_add_n (pp + n, pp + 2 * n, v0, n); + + /* L(vinf) + H(vinf) */ + cy += mpn_add (pp + 2 * n, pp + 2 * n, n, vinf + n, s + s - n); + + cy -= mpn_sub_n (pp + n, pp + n, vm1, 2 * n); + + ASSERT (cy + 1 <= 3); + ASSERT (cy2 <= 2); + + if (LIKELY (cy <= 2)) { + MPN_INCR_U (pp + 2 * n, s + s, cy2); + MPN_INCR_U (pp + 3 * n, s + s - n, cy); + } else { /* cy is negative */ + /* The total contribution of v0+vinf-vm1 can not be negative. */ +#if WANT_ASSERT + /* The borrow in cy stops the propagation of the carry cy2, */ + ASSERT (cy2 == 1); + cy += mpn_add_1 (pp + 2 * n, pp + 2 * n, n, cy2); + ASSERT (cy == 0); +#else + /* we simply fill the area with zeros. */ + MPN_FILL (pp + 2 * n, n, 0); +#endif + } +} diff --git a/gmp-6.3.0/mpn/generic/toom32_mul.c b/gmp-6.3.0/mpn/generic/toom32_mul.c new file mode 100644 index 0000000..1139d17 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom32_mul.c @@ -0,0 +1,320 @@ +/* mpn_toom32_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 1.5 + times as large as bn. Or more accurately, bn < an < 3bn. + + Contributed to the GNU project by Torbjorn Granlund. + Improvements by Marco Bodrato and Niels Möller. + + The idea of applying Toom to unbalanced multiplication is due to Marco + Bodrato and Alberto Zanoni. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2010, 2020, 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluate in: -1, 0, +1, +inf + + <-s-><--n--><--n--> + ___ ______ ______ + |a2_|___a1_|___a0_| + |_b1_|___b0_| + <-t--><--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = (a0+ a1+ a2)*(b0+ b1) # A(1)*B(1) ah <= 2 bh <= 1 + vm1 = (a0- a1+ a2)*(b0- b1) # A(-1)*B(-1) |ah| <= 1 bh = 0 + vinf= a2 * b1 # A(inf)*B(inf) +*/ + +#define TOOM32_MUL_N_REC(p, a, b, n, ws) \ + do { \ + mpn_mul_n (p, a, b, n); \ + } while (0) + +void +mpn_toom32_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr scratch) +{ + mp_size_t n, s, t; + int vm1_neg; + mp_limb_t cy; + mp_limb_signed_t hi; + mp_limb_t ap1_hi, bp1_hi; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2 * n) +#define b0 bp +#define b1 (bp + n) + + /* Required, to ensure that s + t >= n. */ + ASSERT (bn + 2 <= an && an + 6 <= 3*bn); + + n = 2 * an >= 3 * bn ? (an + 2) / (size_t) 3 : (bn + 1) >> 1; + + s = an - 2 * n; + t = bn - n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + ASSERT (s + t >= n); + + /* Product area of size an + bn = 3*n + s + t >= 4*n + 2. */ +#define ap1 (pp) /* n, most significant limb in ap1_hi */ +#define bp1 (pp + n) /* n, most significant bit in bp1_hi */ +#define am1 (pp + 2*n) /* n, most significant bit in hi */ +#define bm1 (pp + 3*n) /* n */ +#define v1 (scratch) /* 2n + 1 */ +#define vm1 (pp) /* 2n + 1 */ +#define scratch_out (scratch + 2*n + 1) /* Currently unused. */ + + /* Scratch need: 2*n + 1 + scratch for the recursive multiplications. */ + + /* FIXME: Keep v1[2*n] and vm1[2*n] in scalar variables? */ + + /* Compute ap1 = a0 + a1 + a2, am1 = a0 - a1 + a2 */ + ap1_hi = mpn_add (ap1, a0, n, a2, s); +#if HAVE_NATIVE_mpn_add_n_sub_n + if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0) + { + ap1_hi = mpn_add_n_sub_n (ap1, am1, a1, ap1, n) >> 1; + hi = 0; + vm1_neg = 1; + } + else + { + cy = mpn_add_n_sub_n (ap1, am1, ap1, a1, n); + hi = ap1_hi - (cy & 1); + ap1_hi += (cy >> 1); + vm1_neg = 0; + } +#else + if (ap1_hi == 0 && mpn_cmp (ap1, a1, n) < 0) + { + ASSERT_NOCARRY (mpn_sub_n (am1, a1, ap1, n)); + hi = 0; + vm1_neg = 1; + } + else + { + hi = ap1_hi - mpn_sub_n (am1, ap1, a1, n); + vm1_neg = 0; + } + ap1_hi += mpn_add_n (ap1, ap1, a1, n); +#endif + + /* Compute bp1 = b0 + b1 and bm1 = b0 - b1. 
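+     Here too only |b0 - b1| is kept (in bm1), with the sign of the product
+     at the point -1 tracked in vm1_neg; bp1_hi holds the single carry bit
+     of b0 + b1.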
*/ + if (t == n) + { +#if HAVE_NATIVE_mpn_add_n_sub_n + if (mpn_cmp (b0, b1, n) < 0) + { + cy = mpn_add_n_sub_n (bp1, bm1, b1, b0, n); + vm1_neg ^= 1; + } + else + { + cy = mpn_add_n_sub_n (bp1, bm1, b0, b1, n); + } + bp1_hi = cy >> 1; +#else + bp1_hi = mpn_add_n (bp1, b0, b1, n); + + if (mpn_cmp (b0, b1, n) < 0) + { + ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, n)); + vm1_neg ^= 1; + } + else + { + ASSERT_NOCARRY (mpn_sub_n (bm1, b0, b1, n)); + } +#endif + } + else + { + /* FIXME: Should still use mpn_add_n_sub_n for the main part. */ + bp1_hi = mpn_add (bp1, b0, n, b1, t); + + if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) + { + ASSERT_NOCARRY (mpn_sub_n (bm1, b1, b0, t)); + MPN_ZERO (bm1 + t, n - t); + vm1_neg ^= 1; + } + else + { + ASSERT_NOCARRY (mpn_sub (bm1, b0, n, b1, t)); + } + } + + TOOM32_MUL_N_REC (v1, ap1, bp1, n, scratch_out); + if (ap1_hi == 1) + { + cy = mpn_add_n (v1 + n, v1 + n, bp1, n); + } + else if (ap1_hi > 1) /* ap1_hi == 2 */ + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy = mpn_addlsh1_n_ip1 (v1 + n, bp1, n); +#else + cy = mpn_addmul_1 (v1 + n, bp1, n, CNST_LIMB(2)); +#endif + } + else + cy = 0; + if (bp1_hi != 0) + cy += ap1_hi + mpn_add_n (v1 + n, v1 + n, ap1, n); + v1[2 * n] = cy; + + TOOM32_MUL_N_REC (vm1, am1, bm1, n, scratch_out); + if (hi) + hi = mpn_add_n (vm1+n, vm1+n, bm1, n); + + vm1[2*n] = hi; + + /* v1 <-- (v1 + vm1) / 2 = x0 + x2 */ + if (vm1_neg) + { +#if HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (v1, v1, vm1, 2*n+1); +#else + mpn_sub_n (v1, v1, vm1, 2*n+1); + ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1)); +#endif + } + else + { +#if HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (v1, v1, vm1, 2*n+1); +#else + mpn_add_n (v1, v1, vm1, 2*n+1); + ASSERT_NOCARRY (mpn_rshift (v1, v1, 2*n+1, 1)); +#endif + } + + /* We get x1 + x3 = (x0 + x2) - (x0 - x1 + x2 - x3), and hence + + y = x1 + x3 + (x0 + x2) * B + = (x0 + x2) * B + (x0 + x2) - vm1. + + y is 3*n + 1 limbs, y = y0 + y1 B + y2 B^2. We store them as + follows: y0 at scratch, y1 at pp + 2*n, and y2 at scratch + n + (already in place, except for carry propagation). + + We thus add + + B^3 B^2 B 1 + | | | | + +-----+----+ + + | x0 + x2 | + +----+-----+----+ + + | x0 + x2 | + +----------+ + - | vm1 | + --+----++----+----+- + | y2 | y1 | y0 | + +-----+----+----+ + + Since we store y0 at the same location as the low half of x0 + x2, we + need to do the middle sum first. */ + + hi = vm1[2*n]; + cy = mpn_add_n (pp + 2*n, v1, v1 + n, n); + MPN_INCR_U (v1 + n, n + 1, cy + v1[2*n]); + + /* FIXME: Can we get rid of this second vm1_neg conditional by + swapping the location of +1 and -1 values? */ + if (vm1_neg) + { + cy = mpn_add_n (v1, v1, vm1, n); + hi += mpn_add_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy); + MPN_INCR_U (v1 + n, n+1, hi); + } + else + { + cy = mpn_sub_n (v1, v1, vm1, n); + hi += mpn_sub_nc (pp + 2*n, pp + 2*n, vm1 + n, n, cy); + MPN_DECR_U (v1 + n, n+1, hi); + } + + TOOM32_MUL_N_REC (pp, a0, b0, n, scratch_out); + /* vinf, s+t limbs. Use mpn_mul for now, to handle unbalanced operands */ + if (s > t) mpn_mul (pp+3*n, a2, s, b1, t); + else mpn_mul (pp+3*n, b1, t, a2, s); + + /* Remaining interpolation. 
+ + y * B + x0 + x3 B^3 - x0 B^2 - x3 B + = (x1 + x3) B + (x0 + x2) B^2 + x0 + x3 B^3 - x0 B^2 - x3 B + = y0 B + y1 B^2 + y3 B^3 + Lx0 + H x0 B + + L x3 B^3 + H x3 B^4 - Lx0 B^2 - H x0 B^3 - L x3 B - H x3 B^2 + = L x0 + (y0 + H x0 - L x3) B + (y1 - L x0 - H x3) B^2 + + (y2 - (H x0 - L x3)) B^3 + H x3 B^4 + + B^4 B^3 B^2 B 1 + | | | | | | + +-------+ +---------+---------+ + | Hx3 | | Hx0-Lx3 | Lx0 | + +------+----------+---------+---------+---------+ + | y2 | y1 | y0 | + ++---------+---------+---------+ + -| Hx0-Lx3 | - Lx0 | + +---------+---------+ + | - Hx3 | + +--------+ + + We must take into account the carry from Hx0 - Lx3. + */ + + cy = mpn_sub_n (pp + n, pp + n, pp+3*n, n); + hi = scratch[2*n] + cy; + + cy = mpn_sub_nc (pp + 2*n, pp + 2*n, pp, n, cy); + hi -= mpn_sub_nc (pp + 3*n, scratch + n, pp + n, n, cy); + + hi += mpn_add (pp + n, pp + n, 3*n, scratch, n); + + /* FIXME: Is support for s + t == n needed? */ + if (LIKELY (s + t > n)) + { + hi -= mpn_sub (pp + 2*n, pp + 2*n, 2*n, pp + 4*n, s+t-n); + + ASSERT (hi >= 0); /* contribution of the middle terms >= 0 */ + MPN_INCR_U (pp + 4*n, s+t-n, hi); + } + else + ASSERT (hi == 0); +} diff --git a/gmp-6.3.0/mpn/generic/toom33_mul.c b/gmp-6.3.0/mpn/generic/toom33_mul.c new file mode 100644 index 0000000..54f055f --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom33_mul.c @@ -0,0 +1,316 @@ +/* mpn_toom33_mul -- Multiply {ap,an} and {p,bn} where an and bn are close in + size. Or more accurately, bn <= an < (3/2)bn. + + Contributed to the GNU project by Torbjorn Granlund. + Additional improvements by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2008, 2010, 2012, 2015, 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" + +/* Evaluate in: -1, 0, +1, +2, +inf + + <-s--><--n--><--n--> + ____ ______ ______ + |_a2_|___a1_|___a0_| + |b2_|___b1_|___b0_| + <-t-><--n--><--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = (a0+ a1+ a2)*(b0+ b1+ b2) # A(1)*B(1) ah <= 2 bh <= 2 + vm1 = (a0- a1+ a2)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 bh <= 1 + v2 = (a0+2a1+4a2)*(b0+2b1+4b2) # A(2)*B(2) ah <= 6 bh <= 6 + vinf= a2 * b2 # A(inf)*B(inf) +*/ + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_mul_basecase 1 +#define MAYBE_mul_toom33 1 +#else +#define MAYBE_mul_basecase \ + (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD) +#define MAYBE_mul_toom33 \ + (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD) +#endif + +/* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced + multiplication at the infinity point. We may have + MAYBE_mul_basecase == 0, and still get s just below + MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get + s == 1 and mpn_toom22_mul will crash. +*/ + +#define TOOM33_MUL_N_REC(p, a, b, n, ws) \ + do { \ + if (MAYBE_mul_basecase \ + && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \ + mpn_mul_basecase (p, a, n, b, n); \ + else if (! MAYBE_mul_toom33 \ + || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \ + mpn_toom22_mul (p, a, n, b, n, ws); \ + else \ + mpn_toom33_mul (p, a, n, b, n, ws); \ + } while (0) + +void +mpn_toom33_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr scratch) +{ + const int __gmpn_cpuvec_initialized = 1; + mp_size_t n, s, t; + int vm1_neg; + mp_limb_t cy, vinf0; + mp_ptr gp; + mp_ptr as1, asm1, as2; + mp_ptr bs1, bsm1, bs2; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2*n) +#define b0 bp +#define b1 (bp + n) +#define b2 (bp + 2*n) + + n = (an + 2) / (size_t) 3; + + s = an - 2 * n; + t = bn - 2 * n; + + ASSERT (an >= bn); + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + + as1 = scratch + 4 * n + 4; + asm1 = scratch + 2 * n + 2; + as2 = pp + n + 1; + + bs1 = pp; + bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */ + bs2 = pp + 2 * n + 2; + + gp = scratch; + + vm1_neg = 0; + + /* Compute as1 and asm1. */ + cy = mpn_add (gp, a0, n, a2, s); +#if HAVE_NATIVE_mpn_add_n_sub_n + if (cy == 0 && mpn_cmp (gp, a1, n) < 0) + { + cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n); + as1[n] = cy >> 1; + asm1[n] = 0; + vm1_neg = 1; + } + else + { + mp_limb_t cy2; + cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n); + as1[n] = cy + (cy2 >> 1); + asm1[n] = cy - (cy2 & 1); + } +#else + as1[n] = cy + mpn_add_n (as1, gp, a1, n); + if (cy == 0 && mpn_cmp (gp, a1, n) < 0) + { + mpn_sub_n (asm1, a1, gp, n); + asm1[n] = 0; + vm1_neg = 1; + } + else + { + cy -= mpn_sub_n (asm1, gp, a1, n); + asm1[n] = cy; + } +#endif + + /* Compute as2. */ +#if HAVE_NATIVE_mpn_rsblsh1_n + cy = mpn_add_n (as2, a2, as1, s); + if (s != n) + cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy); + cy += as1[n]; + cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n); +#else +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (as2, a1, a2, s); + if (s != n) + cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy); + cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); +#else + cy = mpn_add_n (as2, a2, as1, s); + if (s != n) + cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy); + cy += as1[n]; + cy = 2 * cy + mpn_lshift (as2, as2, n, 1); + cy -= mpn_sub_n (as2, as2, a0, n); +#endif +#endif + as2[n] = cy; + + /* Compute bs1 and bsm1. 
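+     The sum gp = b0 + b2 is formed once and then combined with b1 both
+     ways, giving bs1 = B(1) = b0 + b1 + b2 (top limb at most 2) and
+     bsm1 = |B(-1)| = |b0 - b1 + b2| (top limb at most 1); the sign of
+     B(-1) is merged into vm1_neg.  Both bounds are asserted below.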
*/ + cy = mpn_add (gp, b0, n, b2, t); +#if HAVE_NATIVE_mpn_add_n_sub_n + if (cy == 0 && mpn_cmp (gp, b1, n) < 0) + { + cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n); + bs1[n] = cy >> 1; + bsm1[n] = 0; + vm1_neg ^= 1; + } + else + { + mp_limb_t cy2; + cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n); + bs1[n] = cy + (cy2 >> 1); + bsm1[n] = cy - (cy2 & 1); + } +#else + bs1[n] = cy + mpn_add_n (bs1, gp, b1, n); + if (cy == 0 && mpn_cmp (gp, b1, n) < 0) + { + mpn_sub_n (bsm1, b1, gp, n); + bsm1[n] = 0; + vm1_neg ^= 1; + } + else + { + cy -= mpn_sub_n (bsm1, gp, b1, n); + bsm1[n] = cy; + } +#endif + + /* Compute bs2. */ +#if HAVE_NATIVE_mpn_rsblsh1_n + cy = mpn_add_n (bs2, b2, bs1, t); + if (t != n) + cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy); + cy += bs1[n]; + cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n); +#else +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (bs2, b1, b2, t); + if (t != n) + cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy); + cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n); +#else + cy = mpn_add_n (bs2, bs1, b2, t); + if (t != n) + cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy); + cy += bs1[n]; + cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1); + cy -= mpn_sub_n (bs2, bs2, b0, n); +#endif +#endif + bs2[n] = cy; + + ASSERT (as1[n] <= 2); + ASSERT (bs1[n] <= 2); + ASSERT (asm1[n] <= 1); + ASSERT (bsm1[n] <= 1); + ASSERT (as2[n] <= 6); + ASSERT (bs2[n] <= 6); + +#define v0 pp /* 2n */ +#define v1 (pp + 2 * n) /* 2n+1 */ +#define vinf (pp + 4 * n) /* s+t */ +#define vm1 scratch /* 2n+1 */ +#define v2 (scratch + 2 * n + 1) /* 2n+2 */ +#define scratch_out (scratch + 5 * n + 5) + + /* vm1, 2n+1 limbs */ +#ifdef SMALLER_RECURSION + TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out); + cy = 0; + if (asm1[n] != 0) + cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n); + if (bsm1[n] != 0) + cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n); + vm1[2 * n] = cy; +#else + vm1[2 * n] = 0; + TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + (bsm1[n] | asm1[n]), scratch_out); +#endif + + TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */ + + /* vinf, s+t limbs */ + if (s > t) mpn_mul (vinf, a2, s, b2, t); + else TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out); + + vinf0 = vinf[0]; /* v1 overlaps with this */ + +#ifdef SMALLER_RECURSION + /* v1, 2n+1 limbs */ + TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out); + if (as1[n] == 1) + { + cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n); + } + else if (as1[n] != 0) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n); +#else + cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2)); +#endif + } + else + cy = 0; + if (bs1[n] == 1) + { + cy += mpn_add_n (v1 + n, v1 + n, as1, n); + } + else if (bs1[n] != 0) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n); +#else + cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2)); +#endif + } + v1[2 * n] = cy; +#else + cy = vinf[1]; + TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out); + vinf[1] = cy; +#endif + + TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */ + + mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0); +} diff --git a/gmp-6.3.0/mpn/generic/toom3_sqr.c b/gmp-6.3.0/mpn/generic/toom3_sqr.c new file mode 100644 index 0000000..297a27f --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom3_sqr.c @@ -0,0 +1,221 @@ +/* mpn_toom3_sqr -- Square {ap,an}. + + Contributed to the GNU project by Torbjorn Granlund. + Additional improvements by Marco Bodrato. 
+ + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2010, 2012, 2015, 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluate in: -1, 0, +1, +2, +inf + + <-s--><--n--><--n--> + ____ ______ ______ + |_a2_|___a1_|___a0_| + + v0 = a0 ^2 # A(0)^2 + v1 = (a0+ a1+ a2)^2 # A(1)^2 ah <= 2 + vm1 = (a0- a1+ a2)^2 # A(-1)^2 |ah| <= 1 + v2 = (a0+2a1+4a2)^2 # A(2)^2 ah <= 6 + vinf= a2 ^2 # A(inf)^2 +*/ + +#if TUNE_PROGRAM_BUILD || WANT_FAT_BINARY +#define MAYBE_sqr_basecase 1 +#define MAYBE_sqr_toom3 1 +#else +#define MAYBE_sqr_basecase \ + (SQR_TOOM3_THRESHOLD < 3 * SQR_TOOM2_THRESHOLD) +#define MAYBE_sqr_toom3 \ + (SQR_TOOM4_THRESHOLD >= 3 * SQR_TOOM3_THRESHOLD) +#endif + +#define TOOM3_SQR_REC(p, a, n, ws) \ + do { \ + if (MAYBE_sqr_basecase \ + && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) \ + mpn_sqr_basecase (p, a, n); \ + else if (! MAYBE_sqr_toom3 \ + || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \ + mpn_toom2_sqr (p, a, n, ws); \ + else \ + mpn_toom3_sqr (p, a, n, ws); \ + } while (0) + +void +mpn_toom3_sqr (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_ptr scratch) +{ + const int __gmpn_cpuvec_initialized = 1; + mp_size_t n, s; + mp_limb_t cy, vinf0; + mp_ptr gp; + mp_ptr as1, asm1, as2; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2*n) + + n = (an + 2) / (size_t) 3; + + s = an - 2 * n; + + ASSERT (0 < s && s <= n); + + as1 = scratch + 4 * n + 4; + asm1 = scratch + 2 * n + 2; + as2 = pp + n + 1; + + gp = scratch; + + /* Compute as1 and asm1. */ + cy = mpn_add (gp, a0, n, a2, s); +#if HAVE_NATIVE_mpn_add_n_sub_n + if (cy == 0 && mpn_cmp (gp, a1, n) < 0) + { + cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n); + as1[n] = cy >> 1; + asm1[n] = 0; + } + else + { + mp_limb_t cy2; + cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n); + as1[n] = cy + (cy2 >> 1); + asm1[n] = cy - (cy2 & 1); + } +#else + as1[n] = cy + mpn_add_n (as1, gp, a1, n); + if (cy == 0 && mpn_cmp (gp, a1, n) < 0) + { + mpn_sub_n (asm1, a1, gp, n); + asm1[n] = 0; + } + else + { + cy -= mpn_sub_n (asm1, gp, a1, n); + asm1[n] = cy; + } +#endif + + /* Compute as2. 
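+     as2 = A(2) = a0 + 2*a1 + 4*a2, built by repeated shift-and-add (or
+     with the addlsh1/rsblsh1 primitives where available); its top limb
+     as2[n] is at most 6, as asserted below.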
*/ +#if HAVE_NATIVE_mpn_rsblsh1_n + cy = mpn_add_n (as2, a2, as1, s); + if (s != n) + cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy); + cy += as1[n]; + cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n); +#else +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (as2, a1, a2, s); + if (s != n) + cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy); + cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); +#else + cy = mpn_add_n (as2, a2, as1, s); + if (s != n) + cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy); + cy += as1[n]; + cy = 2 * cy + mpn_lshift (as2, as2, n, 1); + cy -= mpn_sub_n (as2, as2, a0, n); +#endif +#endif + as2[n] = cy; + + ASSERT (as1[n] <= 2); + ASSERT (asm1[n] <= 1); + +#define v0 pp /* 2n */ +#define v1 (pp + 2 * n) /* 2n+1 */ +#define vinf (pp + 4 * n) /* s+s */ +#define vm1 scratch /* 2n+1 */ +#define v2 (scratch + 2 * n + 1) /* 2n+2 */ +#define scratch_out (scratch + 5 * n + 5) + + /* vm1, 2n+1 limbs */ +#ifdef SMALLER_RECURSION + TOOM3_SQR_REC (vm1, asm1, n, scratch_out); + cy = asm1[n]; + if (cy != 0) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy += mpn_addlsh1_n_ip1 (vm1 + n, asm1, n); +#else + cy += mpn_addmul_1 (vm1 + n, asm1, n, CNST_LIMB(2)); +#endif + } + vm1[2 * n] = cy; +#else + vm1[2 * n] = 0; + TOOM3_SQR_REC (vm1, asm1, n + asm1[n], scratch_out); +#endif + + TOOM3_SQR_REC (v2, as2, n + 1, scratch_out); /* v2, 2n+1 limbs */ + + TOOM3_SQR_REC (vinf, a2, s, scratch_out); /* vinf, s+s limbs */ + + vinf0 = vinf[0]; /* v1 overlaps with this */ + +#ifdef SMALLER_RECURSION + /* v1, 2n+1 limbs */ + TOOM3_SQR_REC (v1, as1, n, scratch_out); + cy = as1[n]; + if (cy == 1) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n); +#else + cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2)); +#endif + } + else if (cy != 0) + { +#if HAVE_NATIVE_mpn_addlsh2_n_ip1 + cy = 4 + mpn_addlsh2_n_ip1 (v1 + n, as1, n); +#else + cy = 4 + mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(4)); +#endif + } + v1[2 * n] = cy; +#else + cy = vinf[1]; + TOOM3_SQR_REC (v1, as1, n + 1, scratch_out); + vinf[1] = cy; +#endif + + TOOM3_SQR_REC (v0, ap, n, scratch_out); /* v0, 2n limbs */ + + mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + s, 0, vinf0); +} diff --git a/gmp-6.3.0/mpn/generic/toom42_mul.c b/gmp-6.3.0/mpn/generic/toom42_mul.c new file mode 100644 index 0000000..e84ce65 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom42_mul.c @@ -0,0 +1,234 @@ +/* mpn_toom42_mul -- Multiply {ap,an} and {bp,bn} where an is nominally twice + as large as bn. Or more accurately, (3/2)bn < an < 4bn. + + Contributed to the GNU project by Torbjorn Granlund. + Additional improvements by Marco Bodrato. + + The idea of applying toom to unbalanced multiplication is due to Marco + Bodrato and Alberto Zanoni. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2008, 2012, 2014 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluate in: -1, 0, +1, +2, +inf + + <-s-><--n--><--n--><--n--> + ___ ______ ______ ______ + |a3_|___a2_|___a1_|___a0_| + |_b1_|___b0_| + <-t--><--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = (a0+ a1+ a2+ a3)*(b0+ b1) # A(1)*B(1) ah <= 3 bh <= 1 + vm1 = (a0- a1+ a2- a3)*(b0- b1) # A(-1)*B(-1) |ah| <= 1 bh = 0 + v2 = (a0+2a1+4a2+8a3)*(b0+2b1) # A(2)*B(2) ah <= 14 bh <= 2 + vinf= a3 * b1 # A(inf)*B(inf) +*/ + +#define TOOM42_MUL_N_REC(p, a, b, n, ws) \ + do { \ + mpn_mul_n (p, a, b, n); \ + } while (0) + +void +mpn_toom42_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr scratch) +{ + mp_size_t n, s, t; + int vm1_neg; + mp_limb_t cy, vinf0; + mp_ptr a0_a2; + mp_ptr as1, asm1, as2; + mp_ptr bs1, bsm1, bs2; + mp_ptr tmp; + TMP_DECL; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2*n) +#define a3 (ap + 3*n) +#define b0 bp +#define b1 (bp + n) + + n = an >= 2 * bn ? (an + 3) >> 2 : (bn + 1) >> 1; + + s = an - 3 * n; + t = bn - n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + + TMP_MARK; + + tmp = TMP_ALLOC_LIMBS (6 * n + 5); + as1 = tmp; tmp += n + 1; + asm1 = tmp; tmp += n + 1; + as2 = tmp; tmp += n + 1; + bs1 = tmp; tmp += n + 1; + bsm1 = tmp; tmp += n; + bs2 = tmp; tmp += n + 1; + + a0_a2 = pp; + + /* Compute as1 and asm1. */ + vm1_neg = mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0_a2) & 1; + + /* Compute as2. */ +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (as2, a2, a3, s); + if (s != n) + cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy); + cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n); + cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n); +#else + cy = mpn_lshift (as2, a3, s, 1); + cy += mpn_add_n (as2, a2, as2, s); + if (s != n) + cy = mpn_add_1 (as2 + s, a2 + s, n - s, cy); + cy = 2 * cy + mpn_lshift (as2, as2, n, 1); + cy += mpn_add_n (as2, a1, as2, n); + cy = 2 * cy + mpn_lshift (as2, as2, n, 1); + cy += mpn_add_n (as2, a0, as2, n); +#endif + as2[n] = cy; + + /* Compute bs1 and bsm1. */ + if (t == n) + { +#if HAVE_NATIVE_mpn_add_n_sub_n + if (mpn_cmp (b0, b1, n) < 0) + { + cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n); + vm1_neg ^= 1; + } + else + { + cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n); + } + bs1[n] = cy >> 1; +#else + bs1[n] = mpn_add_n (bs1, b0, b1, n); + + if (mpn_cmp (b0, b1, n) < 0) + { + mpn_sub_n (bsm1, b1, b0, n); + vm1_neg ^= 1; + } + else + { + mpn_sub_n (bsm1, b0, b1, n); + } +#endif + } + else + { + bs1[n] = mpn_add (bs1, b0, n, b1, t); + + if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) + { + mpn_sub_n (bsm1, b1, b0, t); + MPN_ZERO (bsm1 + t, n - t); + vm1_neg ^= 1; + } + else + { + mpn_sub (bsm1, b0, n, b1, t); + } + } + + /* Compute bs2, recycling bs1. 
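+     Since bs1 already holds b0 + b1, a single further addition of b1
+     gives bs2 = b0 + 2*b1 = B(2), of n+1 limbs with top limb at most 2.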
bs2=bs1+b1 */ + mpn_add (bs2, bs1, n + 1, b1, t); + + ASSERT (as1[n] <= 3); + ASSERT (bs1[n] <= 1); + ASSERT (asm1[n] <= 1); +/*ASSERT (bsm1[n] == 0);*/ + ASSERT (as2[n] <= 14); + ASSERT (bs2[n] <= 2); + +#define v0 pp /* 2n */ +#define v1 (pp + 2 * n) /* 2n+1 */ +#define vinf (pp + 4 * n) /* s+t */ +#define vm1 scratch /* 2n+1 */ +#define v2 (scratch + 2 * n + 1) /* 2n+2 */ +#define scratch_out scratch + 4 * n + 4 /* Currently unused. */ + + /* vm1, 2n+1 limbs */ + TOOM42_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out); + cy = 0; + if (asm1[n] != 0) + cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n); + vm1[2 * n] = cy; + + TOOM42_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out); /* v2, 2n+1 limbs */ + + /* vinf, s+t limbs */ + if (s > t) mpn_mul (vinf, a3, s, b1, t); + else mpn_mul (vinf, b1, t, a3, s); + + vinf0 = vinf[0]; /* v1 overlaps with this */ + + /* v1, 2n+1 limbs */ + TOOM42_MUL_N_REC (v1, as1, bs1, n, scratch_out); + if (as1[n] == 1) + { + cy = mpn_add_n (v1 + n, v1 + n, bs1, n); + } + else if (as1[n] == 2) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy = mpn_addlsh1_n_ip1 (v1 + n, bs1, n); +#else + cy = mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2)); +#endif + } + else if (as1[n] == 3) + { + cy = mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(3)); + } + else + cy = 0; + if (bs1[n] != 0) + cy += as1[n] + mpn_add_n (v1 + n, v1 + n, as1, n); + v1[2 * n] = cy; + + TOOM42_MUL_N_REC (v0, ap, bp, n, scratch_out); /* v0, 2n limbs */ + + mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0); + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/toom42_mulmid.c b/gmp-6.3.0/mpn/generic/toom42_mulmid.c new file mode 100644 index 0000000..f581b10 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom42_mulmid.c @@ -0,0 +1,237 @@ +/* mpn_toom42_mulmid -- toom42 middle product + + Contributed by David Harvey. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2011 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + + +/* + Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}. + + Neither ap nor bp may overlap rp. + + Must have n >= 4. + + Amount of scratch space required is given by mpn_toom42_mulmid_itch(). + + FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact + requirements should be clarified. 
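+  Roughly speaking, the middle product computed here is the diagonal band
+  of the full product with index sums i+j in [n-1, 2n-2], i.e. the sum of
+  ap[i]*bp[j] over those n diagonals together with up to two limbs of
+  carry, which is why the output occupies n+2 limbs.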
+*/ +void +mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, + mp_ptr scratch) +{ + mp_limb_t cy, e[12], zh, zl; + mp_size_t m; + int neg; + + ASSERT (n >= 4); + ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1)); + ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n)); + + ap += n & 1; /* handle odd row and diagonal later */ + m = n / 2; + + /* (e0h:e0l) etc are correction terms, in 2's complement */ +#define e0l (e[0]) +#define e0h (e[1]) +#define e1l (e[2]) +#define e1h (e[3]) +#define e2l (e[4]) +#define e2h (e[5]) +#define e3l (e[6]) +#define e3h (e[7]) +#define e4l (e[8]) +#define e4h (e[9]) +#define e5l (e[10]) +#define e5h (e[11]) + +#define s (scratch + 2) +#define t (rp + m + 2) +#define p0 rp +#define p1 scratch +#define p2 (rp + m) +#define next_scratch (scratch + 3*m + 1) + + /* + rp scratch + |---------|-----------| |---------|---------|----------| + 0 m 2m+2 0 m 2m 3m+1 + <----p2----> <-------------s-------------> + <----p0----><---t----> <----p1----> + */ + + /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */ + cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0); + cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l, + bp + m, bp, m, cy); + mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy); + + /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */ + if (mpn_cmp (bp + m, bp, m) < 0) + { + ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l, + ap + m - 1, ap + 2*m - 1, m, 0)); + neg = 1; + } + else + { + ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l, + ap + m - 1, ap + 2*m - 1, m, 0)); + neg = 0; + } + + /* recursive middle products. The picture is: + + b[2m-1] A A A B B B - - - - - + ... - A A A B B B - - - - + b[m] - - A A A B B B - - - + b[m-1] - - - C C C D D D - - + ... - - - - C C C D D D - + b[0] - - - - - C C C D D D + a[0] ... a[m] ... a[2m] ... a[4m-2] + */ + + if (m < MULMID_TOOM42_THRESHOLD) + { + /* A + B */ + mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m); + /* accumulate high limbs of p0 into e1 */ + ADDC_LIMB (cy, e1l, e1l, p0[m]); + e1h += p0[m + 1] + cy; + /* (-1)^neg * (B - C) (overwrites first m limbs of s) */ + mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m); + /* C + D (overwrites t) */ + mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m); + } + else + { + /* as above, but use toom42 instead */ + mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch); + ADDC_LIMB (cy, e1l, e1l, p0[m]); + e1h += p0[m + 1] + cy; + mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch); + mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch); + } + + /* apply error terms */ + + /* -e0 at rp[0] */ + SUBC_LIMB (cy, rp[0], rp[0], e0l); + SUBC_LIMB (cy, rp[1], rp[1], e0h + cy); + if (UNLIKELY (cy)) + { + cy = (m > 2) ? 
mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1; + SUBC_LIMB (cy, e1l, e1l, cy); + e1h -= cy; + } + + /* z = e1 - e2 + high(p0) */ + SUBC_LIMB (cy, zl, e1l, e2l); + zh = e1h - e2h - cy; + + /* z at rp[m] */ + ADDC_LIMB (cy, rp[m], rp[m], zl); + zh = (zh + cy) & GMP_NUMB_MASK; + ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh); + cy -= (zh >> (GMP_NUMB_BITS - 1)); + if (UNLIKELY (cy)) + { + if (cy == 1) + mpn_add_1 (rp + m + 2, rp + m + 2, m, 1); + else /* cy == -1 */ + mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1); + } + + /* e3 at rp[2*m] */ + ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l); + rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK; + + /* e4 at p1[0] */ + ADDC_LIMB (cy, p1[0], p1[0], e4l); + ADDC_LIMB (cy, p1[1], p1[1], e4h + cy); + if (UNLIKELY (cy)) + mpn_add_1 (p1 + 2, p1 + 2, m, 1); + + /* -e5 at p1[m] */ + SUBC_LIMB (cy, p1[m], p1[m], e5l); + p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK; + + /* adjustment if p1 ends up negative */ + cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1)); + + /* add (-1)^neg * (p1 - B^m * p1) to output */ + if (neg) + { + mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy); + mpn_add (rp, rp, 2*m + 2, p1, m + 2); /* A + C */ + mpn_sub_n (rp + m, rp + m, p1, m + 2); /* B + D */ + } + else + { + mpn_add_1 (rp + m + 2, rp + m + 2, m, cy); + mpn_sub (rp, rp, 2*m + 2, p1, m + 2); /* A + C */ + mpn_add_n (rp + m, rp + m, p1, m + 2); /* B + D */ + } + + /* odd row and diagonal */ + if (n & 1) + { + /* + Products marked E are already done. We need to do products marked O. + + OOOOO---- + -EEEEO--- + --EEEEO-- + ---EEEEO- + ----EEEEO + */ + + /* first row of O's */ + cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]); + ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy); + + /* O's on diagonal */ + /* FIXME: should probably define an interface "mpn_mulmid_diag_1" + that can handle the sum below. Currently we're relying on + mulmid_basecase being pretty fast for a diagonal sum like this, + which is true at least for the K8 asm version, but surely false + for the generic version. */ + mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1); + mpn_add_n (rp + n - 1, rp + n - 1, e, 3); + } +} diff --git a/gmp-6.3.0/mpn/generic/toom43_mul.c b/gmp-6.3.0/mpn/generic/toom43_mul.c new file mode 100644 index 0000000..34acd25 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom43_mul.c @@ -0,0 +1,238 @@ +/* mpn_toom43_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3 + times as large as bn. Or more accurately, bn < an < 2 bn. + + Contributed to the GNU project by Marco Bodrato. + + The idea of applying toom to unbalanced multiplication is due to Marco + Bodrato and Alberto Zanoni. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. 
+ +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluate in: -2, -1, 0, +1, +2, +inf + + <-s-><--n--><--n--><--n--> + ___ ______ ______ ______ + |a3_|___a2_|___a1_|___a0_| + |_b2_|___b1_|___b0_| + <-t--><--n--><--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = (a0+ a1+ a2+ a3)*(b0+ b1+ b2) # A(1)*B(1) ah <= 3 bh <= 2 + vm1 = (a0- a1+ a2- a3)*(b0- b1+ b2) # A(-1)*B(-1) |ah| <= 1 |bh|<= 1 + v2 = (a0+2a1+4a2+8a3)*(b0+2b1+4b2) # A(2)*B(2) ah <= 14 bh <= 6 + vm2 = (a0-2a1+4a2-8a3)*(b0-2b1+4b2) # A(-2)*B(-2) |ah| <= 9 |bh|<= 4 + vinf= a3 * b2 # A(inf)*B(inf) +*/ + +void +mpn_toom43_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, mp_ptr scratch) +{ + mp_size_t n, s, t; + enum toom6_flags flags; + mp_limb_t cy; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2 * n) +#define a3 (ap + 3 * n) +#define b0 bp +#define b1 (bp + n) +#define b2 (bp + 2 * n) + + n = 1 + (3 * an >= 4 * bn ? (an - 1) >> 2 : (bn - 1) / (size_t) 3); + + s = an - 3 * n; + t = bn - 2 * n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + + /* This is true whenever an >= 25 or bn >= 19, I think. It + guarantees that we can fit 5 values of size n+1 in the product + area. */ + ASSERT (s+t >= 5); + +#define v0 pp /* 2n */ +#define vm1 (scratch) /* 2n+1 */ +#define v1 (pp + 2*n) /* 2n+1 */ +#define vm2 (scratch + 2 * n + 1) /* 2n+1 */ +#define v2 (scratch + 4 * n + 2) /* 2n+1 */ +#define vinf (pp + 5 * n) /* s+t */ +#define bs1 pp /* n+1 */ +#define bsm1 (scratch + 2 * n + 2) /* n+1 */ +#define asm1 (scratch + 3 * n + 3) /* n+1 */ +#define asm2 (scratch + 4 * n + 4) /* n+1 */ +#define bsm2 (pp + n + 1) /* n+1 */ +#define bs2 (pp + 2 * n + 2) /* n+1 */ +#define as2 (pp + 3 * n + 3) /* n+1 */ +#define as1 (pp + 4 * n + 4) /* n+1 */ + + /* Total sccratch need is 6 * n + 3 + 1; we allocate one extra + limb, because products will overwrite 2n+2 limbs. */ + +#define a0a2 scratch +#define b0b2 scratch +#define a1a3 asm1 +#define b1d bsm1 + + /* Compute as2 and asm2. */ + flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_dgr3_pm2 (as2, asm2, ap, n, s, a1a3)); + + /* Compute bs2 and bsm2. */ + b1d[n] = mpn_lshift (b1d, b1, n, 1); /* 2b1 */ +#if HAVE_NATIVE_mpn_addlsh2_n + cy = mpn_addlsh2_n (b0b2, b0, b2, t); /* 4b2 + b0 */ +#else + cy = mpn_lshift (b0b2, b2, t, 2); /* 4b2 */ + cy += mpn_add_n (b0b2, b0b2, b0, t); /* 4b2 + b0 */ +#endif + if (t != n) + cy = mpn_add_1 (b0b2 + t, b0 + t, n - t, cy); + b0b2[n] = cy; + +#if HAVE_NATIVE_mpn_add_n_sub_n + if (mpn_cmp (b0b2, b1d, n+1) < 0) + { + mpn_add_n_sub_n (bs2, bsm2, b1d, b0b2, n+1); + flags = (enum toom6_flags) (flags ^ toom6_vm2_neg); + } + else + { + mpn_add_n_sub_n (bs2, bsm2, b0b2, b1d, n+1); + } +#else + mpn_add_n (bs2, b0b2, b1d, n+1); + if (mpn_cmp (b0b2, b1d, n+1) < 0) + { + mpn_sub_n (bsm2, b1d, b0b2, n+1); + flags = (enum toom6_flags) (flags ^ toom6_vm2_neg); + } + else + { + mpn_sub_n (bsm2, b0b2, b1d, n+1); + } +#endif + + /* Compute as1 and asm1. */ + flags = (enum toom6_flags) (flags ^ (toom6_vm1_neg & mpn_toom_eval_dgr3_pm1 (as1, asm1, ap, n, s, a0a2))); + + /* Compute bs1 and bsm1. 
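+     The sum b0 + b2 is built directly in bsm1 and then combined with b1
+     both ways: bs1 = B(1) = b0 + b1 + b2 (top limb at most 2) and
+     bsm1 = |B(-1)| = |b0 - b1 + b2| (top limb at most 1), with the sign
+     of B(-1) recorded in flags via toom6_vm1_neg.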
*/ + bsm1[n] = mpn_add (bsm1, b0, n, b2, t); +#if HAVE_NATIVE_mpn_add_n_sub_n + if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0) + { + cy = mpn_add_n_sub_n (bs1, bsm1, b1, bsm1, n); + bs1[n] = cy >> 1; + flags = (enum toom6_flags) (flags ^ toom6_vm1_neg); + } + else + { + cy = mpn_add_n_sub_n (bs1, bsm1, bsm1, b1, n); + bs1[n] = bsm1[n] + (cy >> 1); + bsm1[n]-= cy & 1; + } +#else + bs1[n] = bsm1[n] + mpn_add_n (bs1, bsm1, b1, n); + if (bsm1[n] == 0 && mpn_cmp (bsm1, b1, n) < 0) + { + mpn_sub_n (bsm1, b1, bsm1, n); + flags = (enum toom6_flags) (flags ^ toom6_vm1_neg); + } + else + { + bsm1[n] -= mpn_sub_n (bsm1, bsm1, b1, n); + } +#endif + + ASSERT (as1[n] <= 3); + ASSERT (bs1[n] <= 2); + ASSERT (asm1[n] <= 1); + ASSERT (bsm1[n] <= 1); + ASSERT (as2[n] <=14); + ASSERT (bs2[n] <= 6); + ASSERT (asm2[n] <= 9); + ASSERT (bsm2[n] <= 4); + + /* vm1, 2n+1 limbs */ + vm1[2*n] = 0; + mpn_mul_n (vm1, asm1, bsm1, n + (asm1[n] | bsm1[n])); /* W4 */ + + /* vm2, 2n+1 limbs */ + mpn_mul_n (vm2, asm2, bsm2, n+1); /* W2 */ + + /* v2, 2n+1 limbs */ + mpn_mul_n (v2, as2, bs2, n+1); /* W1 */ + + /* v1, 2n+1 limbs */ + mpn_mul_n (v1, as1, bs1, n+1); /* W3 */ + + /* vinf, s+t limbs */ /* W0 */ + if (s > t) mpn_mul (vinf, a3, s, b2, t); + else mpn_mul (vinf, b2, t, a3, s); + + /* v0, 2n limbs */ + mpn_mul_n (v0, ap, bp, n); /* W5 */ + + mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s); + +#undef v0 +#undef vm1 +#undef v1 +#undef vm2 +#undef v2 +#undef vinf +#undef bs1 +#undef bs2 +#undef bsm1 +#undef bsm2 +#undef asm1 +#undef asm2 +/* #undef as1 */ +/* #undef as2 */ +#undef a0a2 +#undef b0b2 +#undef a1a3 +#undef b1d +#undef a0 +#undef a1 +#undef a2 +#undef a3 +#undef b0 +#undef b1 +#undef b2 +} diff --git a/gmp-6.3.0/mpn/generic/toom44_mul.c b/gmp-6.3.0/mpn/generic/toom44_mul.c new file mode 100644 index 0000000..a361899 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom44_mul.c @@ -0,0 +1,239 @@ +/* mpn_toom44_mul -- Multiply {ap,an} and {bp,bn} where an and bn are close in + size. Or more accurately, bn <= an < (4/3)bn. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2008, 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" + +/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf + + <-s--><--n--><--n--><--n--> + ____ ______ ______ ______ + |_a3_|___a2_|___a1_|___a0_| + |b3_|___b2_|___b1_|___b0_| + <-t-><--n--><--n--><--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = ( a0+ a1+ a2+ a3)*( b0+ b1+ b2+ b3) # A(1)*B(1) ah <= 3 bh <= 3 + vm1 = ( a0- a1+ a2- a3)*( b0- b1+ b2- b3) # A(-1)*B(-1) |ah| <= 1 |bh| <= 1 + v2 = ( a0+2a1+4a2+8a3)*( b0+2b1+4b2+8b3) # A(2)*B(2) ah <= 14 bh <= 14 + vm2 = ( a0-2a1+4a2-8a3)*( b0-2b1+4b2-8b3) # A(-2)*B(-2) |ah| <= 9 |bh| <= 9 + vh = (8a0+4a1+2a2+ a3)*(8b0+4b1+2b2+ b3) # A(1/2)*B(1/2) ah <= 14 bh <= 14 + vinf= a3 * b2 # A(inf)*B(inf) +*/ + +#if TUNE_PROGRAM_BUILD +#define MAYBE_mul_basecase 1 +#define MAYBE_mul_toom22 1 +#define MAYBE_mul_toom44 1 +#else +#define MAYBE_mul_basecase \ + (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM22_THRESHOLD) +#define MAYBE_mul_toom22 \ + (MUL_TOOM44_THRESHOLD < 4 * MUL_TOOM33_THRESHOLD) +#define MAYBE_mul_toom44 \ + (MUL_TOOM6H_THRESHOLD >= 4 * MUL_TOOM44_THRESHOLD) +#endif + +#define TOOM44_MUL_N_REC(p, a, b, n, ws) \ + do { \ + if (MAYBE_mul_basecase \ + && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) \ + mpn_mul_basecase (p, a, n, b, n); \ + else if (MAYBE_mul_toom22 \ + && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) \ + mpn_toom22_mul (p, a, n, b, n, ws); \ + else if (! MAYBE_mul_toom44 \ + || BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) \ + mpn_toom33_mul (p, a, n, b, n, ws); \ + else \ + mpn_toom44_mul (p, a, n, b, n, ws); \ + } while (0) + +/* Use of scratch space. In the product area, we store + + ___________________ + |vinf|____|_v1_|_v0_| + s+t 2n-1 2n+1 2n + + The other recursive products, vm1, v2, vm2, vh are stored in the + scratch area. When computing them, we use the product area for + intermediate values. + + Next, we compute v1. We can store the intermediate factors at v0 + and at vh + 2n + 2. + + Finally, for v0 and vinf, factors are parts of the input operands, + and we need scratch space only for the recursive multiplication. + + In all, if S(an) is the scratch need, the needed space is bounded by + + S(an) <= 4 (2*ceil(an/4) + 1) + 1 + S(ceil(an/4) + 1) + + which should give S(n) = 8 n/3 + c log(n) for some constant c. +*/ + +void +mpn_toom44_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr scratch) +{ + mp_size_t n, s, t; + mp_limb_t cy; + enum toom7_flags flags; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2*n) +#define a3 (ap + 3*n) +#define b0 bp +#define b1 (bp + n) +#define b2 (bp + 2*n) +#define b3 (bp + 3*n) + + ASSERT (an >= bn); + + n = (an + 3) >> 2; + + s = an - 3 * n; + t = bn - 3 * n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + ASSERT (s >= t); + + /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the + * following limb, so these must be computed in order, and we need a + * one limb gap to tp. */ +#define v0 pp /* 2n */ +#define v1 (pp + 2 * n) /* 2n+1 */ +#define vinf (pp + 6 * n) /* s+t */ +#define v2 scratch /* 2n+1 */ +#define vm2 (scratch + 2 * n + 1) /* 2n+1 */ +#define vh (scratch + 4 * n + 2) /* 2n+1 */ +#define vm1 (scratch + 6 * n + 3) /* 2n+1 */ +#define tp (scratch + 8*n + 5) + + /* apx and bpx must not overlap with v1 */ +#define apx pp /* n+1 */ +#define amx (pp + n + 1) /* n+1 */ +#define bmx (pp + 2*n + 2) /* n+1 */ +#define bpx (pp + 4*n + 2) /* n+1 */ + + /* Total scratch need: 8*n + 5 + scratch for recursive calls. This + gives roughly 32 n/3 + log term. 
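+   For example, an = bn = 400 gives n = 100, so this level reserves
+   8*100 + 5 = 805 limbs; the recursive calls, each on at most n + 1 = 101
+   limbs, add their own scratch on top of that.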
*/ + + /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3. */ + flags = (enum toom7_flags) (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp)); + + /* Compute bpx = b0 + 2 b1 + 4 b2 + 8 b3 and bmx = b0 - 2 b1 + 4 b2 - 8 b3. */ + flags = (enum toom7_flags) (flags ^ (toom7_w1_neg & mpn_toom_eval_dgr3_pm2 (bpx, bmx, bp, n, t, tp))); + + TOOM44_MUL_N_REC (v2, apx, bpx, n + 1, tp); /* v2, 2n+1 limbs */ + TOOM44_MUL_N_REC (vm2, amx, bmx, n + 1, tp); /* vm2, 2n+1 limbs */ + + /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */ +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (apx, a1, a0, n); + cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n); + if (s < n) + { + mp_limb_t cy2; + cy2 = mpn_addlsh1_n (apx, a3, apx, s); + apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1); + MPN_INCR_U (apx + s, n+1-s, cy2); + } + else + apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n); +#else + cy = mpn_lshift (apx, a0, n, 1); + cy += mpn_add_n (apx, apx, a1, n); + cy = 2*cy + mpn_lshift (apx, apx, n, 1); + cy += mpn_add_n (apx, apx, a2, n); + cy = 2*cy + mpn_lshift (apx, apx, n, 1); + apx[n] = cy + mpn_add (apx, apx, n, a3, s); +#endif + + /* Compute bpx = 8 b0 + 4 b1 + 2 b2 + b3 = (((2*b0 + b1) * 2 + b2) * 2 + b3 */ +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (bpx, b1, b0, n); + cy = 2*cy + mpn_addlsh1_n (bpx, b2, bpx, n); + if (t < n) + { + mp_limb_t cy2; + cy2 = mpn_addlsh1_n (bpx, b3, bpx, t); + bpx[n] = 2*cy + mpn_lshift (bpx + t, bpx + t, n - t, 1); + MPN_INCR_U (bpx + t, n+1-t, cy2); + } + else + bpx[n] = 2*cy + mpn_addlsh1_n (bpx, b3, bpx, n); +#else + cy = mpn_lshift (bpx, b0, n, 1); + cy += mpn_add_n (bpx, bpx, b1, n); + cy = 2*cy + mpn_lshift (bpx, bpx, n, 1); + cy += mpn_add_n (bpx, bpx, b2, n); + cy = 2*cy + mpn_lshift (bpx, bpx, n, 1); + bpx[n] = cy + mpn_add (bpx, bpx, n, b3, t); +#endif + + ASSERT (apx[n] < 15); + ASSERT (bpx[n] < 15); + + TOOM44_MUL_N_REC (vh, apx, bpx, n + 1, tp); /* vh, 2n+1 limbs */ + + /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3. */ + flags = (enum toom7_flags) (flags | (toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp))); + + /* Compute bpx = b0 + b1 + b2 + b3 and bmx = b0 - b1 + b2 - b3. */ + flags = (enum toom7_flags) (flags ^ (toom7_w3_neg & mpn_toom_eval_dgr3_pm1 (bpx, bmx, bp, n, t, tp))); + + ASSERT (amx[n] <= 1); + ASSERT (bmx[n] <= 1); + + vm1 [2 * n] = 0; + TOOM44_MUL_N_REC (vm1, amx, bmx, n + (bmx[n] | amx[n]), tp); /* vm1, 2n+1 limbs */ + /* Clobbers amx, bmx. */ + TOOM44_MUL_N_REC (v1, apx, bpx, n + 1, tp); /* v1, 2n+1 limbs */ + + TOOM44_MUL_N_REC (v0, a0, b0, n, tp); + if (s > t) + mpn_mul (vinf, a3, s, b3, t); + else + TOOM44_MUL_N_REC (vinf, a3, b3, s, tp); /* vinf, s+t limbs */ + + mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, tp); +} diff --git a/gmp-6.3.0/mpn/generic/toom4_sqr.c b/gmp-6.3.0/mpn/generic/toom4_sqr.c new file mode 100644 index 0000000..fd59d1c --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom4_sqr.c @@ -0,0 +1,164 @@ +/* mpn_toom4_sqr -- Square {ap,an}. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2010, 2013, 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+ +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluate in: -2, -1, 0, +1/2, +1, +2, +inf + + <-s--><--n--><--n--><--n--> + ____ ______ ______ ______ + |_a3_|___a2_|___a1_|___a0_| + + v0 = a0 ^2 # A(0)^2 + v1 = ( a0+ a1+ a2+ a3)^2 # A(1)^2 ah <= 3 + vm1 = ( a0- a1+ a2- a3)^2 # A(-1)^2 |ah| <= 1 + v2 = ( a0+2a1+4a2+8a3)^2 # A(2)^2 ah <= 14 + vm2 = ( a0-2a1+4a2-8a3)^2 # A(-2)^2 -9<=ah<=4 + vh = (8a0+4a1+2a2+ a3)^2 # A(1/2)^2 ah <= 14 + vinf= a3 ^2 # A(inf)^2 +*/ + +#if TUNE_PROGRAM_BUILD +#define MAYBE_sqr_basecase 1 +#define MAYBE_sqr_toom2 1 +#define MAYBE_sqr_toom4 1 +#else +#define MAYBE_sqr_basecase \ + (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM2_THRESHOLD) +#define MAYBE_sqr_toom2 \ + (SQR_TOOM4_THRESHOLD < 4 * SQR_TOOM3_THRESHOLD) +#define MAYBE_sqr_toom4 \ + (SQR_TOOM6_THRESHOLD >= 4 * SQR_TOOM4_THRESHOLD) +#endif + +#define TOOM4_SQR_REC(p, a, n, ws) \ + do { \ + if (MAYBE_sqr_basecase \ + && BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD)) \ + mpn_sqr_basecase (p, a, n); \ + else if (MAYBE_sqr_toom2 \ + && BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD)) \ + mpn_toom2_sqr (p, a, n, ws); \ + else if (! MAYBE_sqr_toom4 \ + || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD)) \ + mpn_toom3_sqr (p, a, n, ws); \ + else \ + mpn_toom4_sqr (p, a, n, ws); \ + } while (0) + +void +mpn_toom4_sqr (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_ptr scratch) +{ + mp_size_t n, s; + mp_limb_t cy; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2*n) +#define a3 (ap + 3*n) + + n = (an + 3) >> 2; + + s = an - 3 * n; + + ASSERT (0 < s && s <= n); + + /* NOTE: The multiplications to v2, vm2, vh and vm1 overwrites the + * following limb, so these must be computed in order, and we need a + * one limb gap to tp. */ +#define v0 pp /* 2n */ +#define v1 (pp + 2 * n) /* 2n+1 */ +#define vinf (pp + 6 * n) /* s+t */ +#define v2 scratch /* 2n+1 */ +#define vm2 (scratch + 2 * n + 1) /* 2n+1 */ +#define vh (scratch + 4 * n + 2) /* 2n+1 */ +#define vm1 (scratch + 6 * n + 3) /* 2n+1 */ +#define tp (scratch + 8*n + 5) + + /* No overlap with v1 */ +#define apx pp /* n+1 */ +#define amx (pp + 4*n + 2) /* n+1 */ + + /* Total scratch need: 8*n + 5 + scratch for recursive calls. This + gives roughly 32 n/3 + log term. */ + + /* Compute apx = a0 + 2 a1 + 4 a2 + 8 a3 and amx = a0 - 2 a1 + 4 a2 - 8 a3. 
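+      That is, apx = A(2) and amx = |A(-2)| for A(x) = a3*x^3 + a2*x^2 + a1*x + a0;
+      since both values are squared below, the sign of A(-2) is irrelevant and the
+      return value of mpn_toom_eval_dgr3_pm2 is ignored.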
*/ + mpn_toom_eval_dgr3_pm2 (apx, amx, ap, n, s, tp); + + TOOM4_SQR_REC (v2, apx, n + 1, tp); /* v2, 2n+1 limbs */ + TOOM4_SQR_REC (vm2, amx, n + 1, tp); /* vm2, 2n+1 limbs */ + + /* Compute apx = 8 a0 + 4 a1 + 2 a2 + a3 = (((2*a0 + a1) * 2 + a2) * 2 + a3 */ +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (apx, a1, a0, n); + cy = 2*cy + mpn_addlsh1_n (apx, a2, apx, n); + if (s < n) + { + mp_limb_t cy2; + cy2 = mpn_addlsh1_n (apx, a3, apx, s); + apx[n] = 2*cy + mpn_lshift (apx + s, apx + s, n - s, 1); + MPN_INCR_U (apx + s, n+1-s, cy2); + } + else + apx[n] = 2*cy + mpn_addlsh1_n (apx, a3, apx, n); +#else + cy = mpn_lshift (apx, a0, n, 1); + cy += mpn_add_n (apx, apx, a1, n); + cy = 2*cy + mpn_lshift (apx, apx, n, 1); + cy += mpn_add_n (apx, apx, a2, n); + cy = 2*cy + mpn_lshift (apx, apx, n, 1); + apx[n] = cy + mpn_add (apx, apx, n, a3, s); +#endif + + ASSERT (apx[n] < 15); + + TOOM4_SQR_REC (vh, apx, n + 1, tp); /* vh, 2n+1 limbs */ + + /* Compute apx = a0 + a1 + a2 + a3 and amx = a0 - a1 + a2 - a3. */ + mpn_toom_eval_dgr3_pm1 (apx, amx, ap, n, s, tp); + + TOOM4_SQR_REC (v1, apx, n + 1, tp); /* v1, 2n+1 limbs */ + vm1 [2 * n] = 0; + TOOM4_SQR_REC (vm1, amx, n + amx[n], tp); /* vm1, 2n+1 limbs */ + + TOOM4_SQR_REC (v0, a0, n, tp); + TOOM4_SQR_REC (vinf, a3, s, tp); /* vinf, 2s limbs */ + + mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) 0, vm2, vm1, v2, vh, 2*s, tp); +} diff --git a/gmp-6.3.0/mpn/generic/toom52_mul.c b/gmp-6.3.0/mpn/generic/toom52_mul.c new file mode 100644 index 0000000..974059b --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom52_mul.c @@ -0,0 +1,256 @@ +/* mpn_toom52_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 4/3 + times as large as bn. Or more accurately, bn < an < 2 bn. + + Contributed to the GNU project by Marco Bodrato. + + The idea of applying toom to unbalanced multiplication is due to Marco + Bodrato and Alberto Zanoni. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + + +#include "gmp-impl.h" + +/* Evaluate in: -2, -1, 0, +1, +2, +inf + + <-s-><--n--><--n--><--n--><--n--> + ___ ______ ______ ______ ______ + |a4_|___a3_|___a2_|___a1_|___a0_| + |b1|___b0_| + <--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = (a0+ a1+ a2+ a3+ a4)*(b0+ b1) # A(1)*B(1) ah <= 4 bh <= 1 + vm1 = (a0- a1+ a2- a3+ a4)*(b0- b1) # A(-1)*B(-1) |ah| <= 2 bh = 0 + v2 = (a0+2a1+4a2+8a3+16a4)*(b0+2b1) # A(2)*B(2) ah <= 30 bh <= 2 + vm2 = (a0-2a1+4a2-8a3+16a4)*(b0-2b1) # A(-2)*B(-2) |ah| <= 20 |bh|<= 1 + vinf= a4 * b1 # A(inf)*B(inf) + + Some slight optimization in evaluation are taken from the paper: + "Towards Optimal Toom-Cook Multiplication for Univariate and + Multivariate Polynomials in Characteristic 2 and 0." +*/ + +void +mpn_toom52_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, mp_ptr scratch) +{ + mp_size_t n, s, t; + enum toom6_flags flags; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2 * n) +#define a3 (ap + 3 * n) +#define a4 (ap + 4 * n) +#define b0 bp +#define b1 (bp + n) + + n = 1 + (2 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) >> 1); + + s = an - 4 * n; + t = bn - n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + + /* Ensures that 5 values of n+1 limbs each fits in the product area. + Borderline cases are an = 32, bn = 8, n = 7, and an = 36, bn = 9, + n = 8. */ + ASSERT (s+t >= 5); + +#define v0 pp /* 2n */ +#define vm1 (scratch) /* 2n+1 */ +#define v1 (pp + 2 * n) /* 2n+1 */ +#define vm2 (scratch + 2 * n + 1) /* 2n+1 */ +#define v2 (scratch + 4 * n + 2) /* 2n+1 */ +#define vinf (pp + 5 * n) /* s+t */ +#define bs1 pp /* n+1 */ +#define bsm1 (scratch + 2 * n + 2) /* n */ +#define asm1 (scratch + 3 * n + 3) /* n+1 */ +#define asm2 (scratch + 4 * n + 4) /* n+1 */ +#define bsm2 (pp + n + 1) /* n+1 */ +#define bs2 (pp + 2 * n + 2) /* n+1 */ +#define as2 (pp + 3 * n + 3) /* n+1 */ +#define as1 (pp + 4 * n + 4) /* n+1 */ + + /* Scratch need is 6 * n + 3 + 1. We need one extra limb, because + products will overwrite 2n+2 limbs. */ + +#define a0a2 scratch +#define a1a3 asm1 + + /* Compute as2 and asm2. */ + flags = (enum toom6_flags) (toom6_vm2_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, a1a3)); + + /* Compute bs1 and bsm1. */ + if (t == n) + { +#if HAVE_NATIVE_mpn_add_n_sub_n + mp_limb_t cy; + + if (mpn_cmp (b0, b1, n) < 0) + { + cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n); + flags = (enum toom6_flags) (flags ^ toom6_vm1_neg); + } + else + { + cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n); + } + bs1[n] = cy >> 1; +#else + bs1[n] = mpn_add_n (bs1, b0, b1, n); + if (mpn_cmp (b0, b1, n) < 0) + { + mpn_sub_n (bsm1, b1, b0, n); + flags = (enum toom6_flags) (flags ^ toom6_vm1_neg); + } + else + { + mpn_sub_n (bsm1, b0, b1, n); + } +#endif + } + else + { + bs1[n] = mpn_add (bs1, b0, n, b1, t); + if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) + { + mpn_sub_n (bsm1, b1, b0, t); + MPN_ZERO (bsm1 + t, n - t); + flags = (enum toom6_flags) (flags ^ toom6_vm1_neg); + } + else + { + mpn_sub (bsm1, b0, n, b1, t); + } + } + + /* Compute bs2 and bsm2, recycling bs1 and bsm1. 
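+     With B(x) = b1*x + b0 we have B(2) = B(1) + b1 and B(-2) = B(-1) - b1, hence: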
bs2=bs1+b1; bsm2=bsm1-b1 */ + mpn_add (bs2, bs1, n+1, b1, t); + if (flags & toom6_vm1_neg) + { + bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t); + flags = (enum toom6_flags) (flags ^ toom6_vm2_neg); + } + else + { + bsm2[n] = 0; + if (t == n) + { + if (mpn_cmp (bsm1, b1, n) < 0) + { + mpn_sub_n (bsm2, b1, bsm1, n); + flags = (enum toom6_flags) (flags ^ toom6_vm2_neg); + } + else + { + mpn_sub_n (bsm2, bsm1, b1, n); + } + } + else + { + if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0) + { + mpn_sub_n (bsm2, b1, bsm1, t); + MPN_ZERO (bsm2 + t, n - t); + flags = (enum toom6_flags) (flags ^ toom6_vm2_neg); + } + else + { + mpn_sub (bsm2, bsm1, n, b1, t); + } + } + } + + /* Compute as1 and asm1. */ + flags = (enum toom6_flags) (flags ^ (toom6_vm1_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, a0a2))); + + ASSERT (as1[n] <= 4); + ASSERT (bs1[n] <= 1); + ASSERT (asm1[n] <= 2); +/* ASSERT (bsm1[n] <= 1); */ + ASSERT (as2[n] <=30); + ASSERT (bs2[n] <= 2); + ASSERT (asm2[n] <= 20); + ASSERT (bsm2[n] <= 1); + + /* vm1, 2n+1 limbs */ + mpn_mul (vm1, asm1, n+1, bsm1, n); /* W4 */ + + /* vm2, 2n+1 limbs */ + mpn_mul_n (vm2, asm2, bsm2, n+1); /* W2 */ + + /* v2, 2n+1 limbs */ + mpn_mul_n (v2, as2, bs2, n+1); /* W1 */ + + /* v1, 2n+1 limbs */ + mpn_mul_n (v1, as1, bs1, n+1); /* W3 */ + + /* vinf, s+t limbs */ /* W0 */ + if (s > t) mpn_mul (vinf, a4, s, b1, t); + else mpn_mul (vinf, b1, t, a4, s); + + /* v0, 2n limbs */ + mpn_mul_n (v0, ap, bp, n); /* W5 */ + + mpn_toom_interpolate_6pts (pp, n, flags, vm1, vm2, v2, t + s); + +#undef v0 +#undef vm1 +#undef v1 +#undef vm2 +#undef v2 +#undef vinf +#undef bs1 +#undef bs2 +#undef bsm1 +#undef bsm2 +#undef asm1 +#undef asm2 +#undef as1 +#undef as2 +#undef a0a2 +#undef b0b2 +#undef a1a3 +#undef a0 +#undef a1 +#undef a2 +#undef a3 +#undef b0 +#undef b1 +#undef b2 + +} diff --git a/gmp-6.3.0/mpn/generic/toom53_mul.c b/gmp-6.3.0/mpn/generic/toom53_mul.c new file mode 100644 index 0000000..c934297 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom53_mul.c @@ -0,0 +1,331 @@ +/* mpn_toom53_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 5/3 + times as large as bn. Or more accurately, (4/3)bn < an < (5/2)bn. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + The idea of applying toom to unbalanced multiplication is due to Marco + Bodrato and Alberto Zanoni. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2008, 2012, 2014, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluate in: 0, +1, -1, +2, -2, 1/2, +inf + + <-s-><--n--><--n--><--n--><--n--> + ___ ______ ______ ______ ______ + |a4_|___a3_|___a2_|___a1_|___a0_| + |__b2|___b1_|___b0_| + <-t--><--n--><--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = ( a0+ a1+ a2+ a3+ a4)*( b0+ b1+ b2) # A(1)*B(1) ah <= 4 bh <= 2 + vm1 = ( a0- a1+ a2- a3+ a4)*( b0- b1+ b2) # A(-1)*B(-1) |ah| <= 2 bh <= 1 + v2 = ( a0+2a1+4a2+8a3+16a4)*( b0+2b1+4b2) # A(2)*B(2) ah <= 30 bh <= 6 + vm2 = ( a0-2a1+4a2-8a3+16a4)*( b0-2b1+4b2) # A(2)*B(2) -9<=ah<=20 -1<=bh<=4 + vh = (16a0+8a1+4a2+2a3+ a4)*(4b0+2b1+ b2) # A(1/2)*B(1/2) ah <= 30 bh <= 6 + vinf= a4 * b2 # A(inf)*B(inf) +*/ + +void +mpn_toom53_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr scratch) +{ + mp_size_t n, s, t; + mp_limb_t cy; + mp_ptr gp; + mp_ptr as1, asm1, as2, asm2, ash; + mp_ptr bs1, bsm1, bs2, bsm2, bsh; + mp_ptr tmp; + enum toom7_flags flags; + TMP_DECL; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2*n) +#define a3 (ap + 3*n) +#define a4 (ap + 4*n) +#define b0 bp +#define b1 (bp + n) +#define b2 (bp + 2*n) + + n = 1 + (3 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 3); + + s = an - 4 * n; + t = bn - 2 * n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + + TMP_MARK; + + tmp = TMP_ALLOC_LIMBS (10 * (n + 1)); + as1 = tmp; tmp += n + 1; + asm1 = tmp; tmp += n + 1; + as2 = tmp; tmp += n + 1; + asm2 = tmp; tmp += n + 1; + ash = tmp; tmp += n + 1; + bs1 = tmp; tmp += n + 1; + bsm1 = tmp; tmp += n + 1; + bs2 = tmp; tmp += n + 1; + bsm2 = tmp; tmp += n + 1; + bsh = tmp; tmp += n + 1; + + gp = pp; + + /* Compute as1 and asm1. */ + flags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 4, ap, n, s, gp)); + + /* Compute as2 and asm2. */ + flags = (enum toom7_flags) (flags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 4, ap, n, s, gp))); + + /* Compute ash = 16 a0 + 8 a1 + 4 a2 + 2 a3 + a4 + = 2*(2*(2*(2*a0 + a1) + a2) + a3) + a4 */ +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (ash, a1, a0, n); + cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n); + cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n); + if (s < n) + { + mp_limb_t cy2; + cy2 = mpn_addlsh1_n (ash, a4, ash, s); + ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1); + MPN_INCR_U (ash + s, n+1-s, cy2); + } + else + ash[n] = 2*cy + mpn_addlsh1_n (ash, a4, ash, n); +#else + cy = mpn_lshift (ash, a0, n, 1); + cy += mpn_add_n (ash, ash, a1, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a2, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a3, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + ash[n] = cy + mpn_add (ash, ash, n, a4, s); +#endif + + /* Compute bs1 and bsm1. 
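+     Here bs1 = b0 + b1 + b2 = B(1) and bsm1 = |b0 - b1 + b2| = |B(-1)|; when b1
+     exceeds b0 + b2 the difference is negated and the sign is recorded by
+     toggling toom7_w3_neg in flags.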
*/ + bs1[n] = mpn_add (bs1, b0, n, b2, t); /* b0 + b2 */ +#if HAVE_NATIVE_mpn_add_n_sub_n + if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0) + { + bs1[n] = mpn_add_n_sub_n (bs1, bsm1, b1, bs1, n) >> 1; + bsm1[n] = 0; + flags = (enum toom7_flags) (flags ^ toom7_w3_neg); + } + else + { + cy = mpn_add_n_sub_n (bs1, bsm1, bs1, b1, n); + bsm1[n] = bs1[n] - (cy & 1); + bs1[n] += (cy >> 1); + } +#else + if (bs1[n] == 0 && mpn_cmp (bs1, b1, n) < 0) + { + mpn_sub_n (bsm1, b1, bs1, n); + bsm1[n] = 0; + flags = (enum toom7_flags) (flags ^ toom7_w3_neg); + } + else + { + bsm1[n] = bs1[n] - mpn_sub_n (bsm1, bs1, b1, n); + } + bs1[n] += mpn_add_n (bs1, bs1, b1, n); /* b0+b1+b2 */ +#endif + + /* Compute bs2 and bsm2. */ +#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n +#if HAVE_NATIVE_mpn_addlsh2_n + cy = mpn_addlsh2_n (bs2, b0, b2, t); +#else /* HAVE_NATIVE_mpn_addlsh_n */ + cy = mpn_addlsh_n (bs2, b0, b2, t, 2); +#endif + if (t < n) + cy = mpn_add_1 (bs2 + t, b0 + t, n - t, cy); + bs2[n] = cy; +#else + cy = mpn_lshift (gp, b2, t, 2); + bs2[n] = mpn_add (bs2, b0, n, gp, t); + MPN_INCR_U (bs2 + t, n+1-t, cy); +#endif + + gp[n] = mpn_lshift (gp, b1, n, 1); + +#if HAVE_NATIVE_mpn_add_n_sub_n + if (mpn_cmp (bs2, gp, n+1) < 0) + { + ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, gp, bs2, n+1)); + flags = (enum toom7_flags) (flags ^ toom7_w1_neg); + } + else + { + ASSERT_NOCARRY (mpn_add_n_sub_n (bs2, bsm2, bs2, gp, n+1)); + } +#else + if (mpn_cmp (bs2, gp, n+1) < 0) + { + ASSERT_NOCARRY (mpn_sub_n (bsm2, gp, bs2, n+1)); + flags = (enum toom7_flags) (flags ^ toom7_w1_neg); + } + else + { + ASSERT_NOCARRY (mpn_sub_n (bsm2, bs2, gp, n+1)); + } + mpn_add_n (bs2, bs2, gp, n+1); +#endif + + /* Compute bsh = 4 b0 + 2 b1 + b2 = 2*(2*b0 + b1)+b2. */ +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (bsh, b1, b0, n); + if (t < n) + { + mp_limb_t cy2; + cy2 = mpn_addlsh1_n (bsh, b2, bsh, t); + bsh[n] = 2*cy + mpn_lshift (bsh + t, bsh + t, n - t, 1); + MPN_INCR_U (bsh + t, n+1-t, cy2); + } + else + bsh[n] = 2*cy + mpn_addlsh1_n (bsh, b2, bsh, n); +#else + cy = mpn_lshift (bsh, b0, n, 1); + cy += mpn_add_n (bsh, bsh, b1, n); + cy = 2*cy + mpn_lshift (bsh, bsh, n, 1); + bsh[n] = cy + mpn_add (bsh, bsh, n, b2, t); +#endif + + ASSERT (as1[n] <= 4); + ASSERT (bs1[n] <= 2); + ASSERT (asm1[n] <= 2); + ASSERT (bsm1[n] <= 1); + ASSERT (as2[n] <= 30); + ASSERT (bs2[n] <= 6); + ASSERT (asm2[n] <= 20); + ASSERT (bsm2[n] <= 4); + ASSERT (ash[n] <= 30); + ASSERT (bsh[n] <= 6); + +#define v0 pp /* 2n */ +#define v1 (pp + 2 * n) /* 2n+1 */ +#define vinf (pp + 6 * n) /* s+t */ +#define v2 scratch /* 2n+1 */ +#define vm2 (scratch + 2 * n + 1) /* 2n+1 */ +#define vh (scratch + 4 * n + 2) /* 2n+1 */ +#define vm1 (scratch + 6 * n + 3) /* 2n+1 */ +#define scratch_out (scratch + 8 * n + 4) /* 2n+1 */ + /* Total scratch need: 10*n+5 */ + + /* Must be in allocation order, as they overwrite one limb beyond + * 2n+1. 
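+      Each (n+1) x (n+1) product fills 2n+2 limbs and so clobbers the first limb
+      of the buffer that follows it; computing v2, then vm2, then vh rewrites
+      every clobbered limb before it is read.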
*/ + mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */ + mpn_mul_n (vm2, asm2, bsm2, n + 1); /* vm2, 2n+1 limbs */ + mpn_mul_n (vh, ash, bsh, n + 1); /* vh, 2n+1 limbs */ + + /* vm1, 2n+1 limbs */ +#ifdef SMALLER_RECURSION + mpn_mul_n (vm1, asm1, bsm1, n); + if (asm1[n] == 1) + { + cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n); + } + else if (asm1[n] == 2) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy = 2 * bsm1[n] + mpn_addlsh1_n_ip1 (vm1 + n, bsm1, n); +#else + cy = 2 * bsm1[n] + mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2)); +#endif + } + else + cy = 0; + if (bsm1[n] != 0) + cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n); + vm1[2 * n] = cy; +#else /* SMALLER_RECURSION */ + vm1[2 * n] = 0; + mpn_mul_n (vm1, asm1, bsm1, n + ((asm1[n] | bsm1[n]) != 0)); +#endif /* SMALLER_RECURSION */ + + /* v1, 2n+1 limbs */ +#ifdef SMALLER_RECURSION + mpn_mul_n (v1, as1, bs1, n); + if (as1[n] == 1) + { + cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n); + } + else if (as1[n] == 2) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy = 2 * bs1[n] + mpn_addlsh1_n_ip1 (v1 + n, bs1, n); +#else + cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2)); +#endif + } + else if (as1[n] != 0) + { + cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]); + } + else + cy = 0; + if (bs1[n] == 1) + { + cy += mpn_add_n (v1 + n, v1 + n, as1, n); + } + else if (bs1[n] == 2) + { +#if HAVE_NATIVE_mpn_addlsh1_n_ip1 + cy += mpn_addlsh1_n_ip1 (v1 + n, as1, n); +#else + cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2)); +#endif + } + v1[2 * n] = cy; +#else /* SMALLER_RECURSION */ + v1[2 * n] = 0; + mpn_mul_n (v1, as1, bs1, n + ((as1[n] | bs1[n]) != 0)); +#endif /* SMALLER_RECURSION */ + + mpn_mul_n (v0, a0, b0, n); /* v0, 2n limbs */ + + /* vinf, s+t limbs */ + if (s > t) mpn_mul (vinf, a4, s, b2, t); + else mpn_mul (vinf, b2, t, a4, s); + + mpn_toom_interpolate_7pts (pp, n, flags, vm2, vm1, v2, vh, s + t, + scratch_out); + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/toom54_mul.c b/gmp-6.3.0/mpn/generic/toom54_mul.c new file mode 100644 index 0000000..343b02e --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom54_mul.c @@ -0,0 +1,142 @@ +/* Implementation of the algorithm for Toom-Cook 4.5-way. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + + +/* Toom-4.5, the splitting 5x4 unbalanced version. + Evaluate in: infinity, +4, -4, +2, -2, +1, -1, 0. + + <--s-><--n--><--n--><--n--><--n--> + ____ ______ ______ ______ ______ + |_a4_|__a3__|__a2__|__a1__|__a0__| + |b3_|__b2__|__b1__|__b0__| + <-t-><--n--><--n--><--n--> + +*/ +#define TOOM_54_MUL_N_REC(p, a, b, n, ws) \ + do { mpn_mul_n (p, a, b, n); \ + } while (0) + +#define TOOM_54_MUL_REC(p, a, na, b, nb, ws) \ + do { mpn_mul (p, a, na, b, nb); \ + } while (0) + +void +mpn_toom54_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, mp_ptr scratch) +{ + mp_size_t n, s, t; + int sign; + + /***************************** decomposition *******************************/ +#define a4 (ap + 4 * n) +#define b3 (bp + 3 * n) + + ASSERT (an >= bn); + n = 1 + (4 * an >= 5 * bn ? (an - 1) / (size_t) 5 : (bn - 1) / (size_t) 4); + + s = an - 4 * n; + t = bn - 3 * n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + /* Required by mpn_toom_interpolate_8pts. */ + ASSERT ( s + t >= n ); + ASSERT ( s + t > 4); + ASSERT ( n > 2); + +#define r8 pp /* 2n */ +#define r7 scratch /* 3n+1 */ +#define r5 (pp + 3*n) /* 3n+1 */ +#define v0 (pp + 3*n) /* n+1 */ +#define v1 (pp + 4*n+1) /* n+1 */ +#define v2 (pp + 5*n+2) /* n+1 */ +#define v3 (pp + 6*n+3) /* n+1 */ +#define r3 (scratch + 3 * n + 1) /* 3n+1 */ +#define r1 (pp + 7*n) /* s+t <= 2*n */ +#define ws (scratch + 6 * n + 2) /* ??? */ + + /* Alloc also 3n+1 limbs for ws... mpn_toom_interpolate_8pts may + need all of them, when DO_mpn_sublsh_n usea a scratch */ + /********************** evaluation and recursive calls *********************/ + /* $\pm4$ */ + sign = mpn_toom_eval_pm2exp (v2, v0, 4, ap, n, s, 2, pp) + ^ mpn_toom_eval_pm2exp (v3, v1, 3, bp, n, t, 2, pp); + TOOM_54_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-4)*B(-4) */ + TOOM_54_MUL_N_REC(r3, v2, v3, n + 1, ws); /* A(+4)*B(+4) */ + mpn_toom_couple_handling (r3, 2*n+1, pp, sign, n, 2, 4); + + /* $\pm1$ */ + sign = mpn_toom_eval_pm1 (v2, v0, 4, ap, n, s, pp) + ^ mpn_toom_eval_dgr3_pm1 (v3, v1, bp, n, t, pp); + TOOM_54_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-1)*B(-1) */ + TOOM_54_MUL_N_REC(r7, v2, v3, n + 1, ws); /* A(1)*B(1) */ + mpn_toom_couple_handling (r7, 2*n+1, pp, sign, n, 0, 0); + + /* $\pm2$ */ + sign = mpn_toom_eval_pm2 (v2, v0, 4, ap, n, s, pp) + ^ mpn_toom_eval_dgr3_pm2 (v3, v1, bp, n, t, pp); + TOOM_54_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-2)*B(-2) */ + TOOM_54_MUL_N_REC(r5, v2, v3, n + 1, ws); /* A(+2)*B(+2) */ + mpn_toom_couple_handling (r5, 2*n+1, pp, sign, n, 1, 2); + + /* A(0)*B(0) */ + TOOM_54_MUL_N_REC(pp, ap, bp, n, ws); + + /* Infinity */ + if (s > t) { + TOOM_54_MUL_REC(r1, a4, s, b3, t, ws); + } else { + TOOM_54_MUL_REC(r1, b3, t, a4, s, ws); + }; + + mpn_toom_interpolate_8pts (pp, n, r3, r7, s + t, ws); + +#undef a4 +#undef b3 +#undef r1 +#undef r3 +#undef r5 +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef r7 +#undef r8 +#undef ws +} diff --git a/gmp-6.3.0/mpn/generic/toom62_mul.c b/gmp-6.3.0/mpn/generic/toom62_mul.c new file mode 100644 index 0000000..d971cc0 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom62_mul.c @@ -0,0 +1,310 @@ +/* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times + as large as bn. Or more accurately, (5/2)bn < an < 6bn. + + Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato. + + The idea of applying toom to unbalanced multiplication is due to Marco + Bodrato and Alberto Zanoni. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. 
IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006-2008, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluate in: + 0, +1, -1, +2, -2, 1/2, +inf + + <-s-><--n--><--n--><--n--><--n--><--n--> + ___ ______ ______ ______ ______ ______ + |a5_|___a4_|___a3_|___a2_|___a1_|___a0_| + |_b1_|___b0_| + <-t--><--n--> + + v0 = a0 * b0 # A(0)*B(0) + v1 = ( a0+ a1+ a2+ a3+ a4+ a5)*( b0+ b1) # A(1)*B(1) ah <= 5 bh <= 1 + vm1 = ( a0- a1+ a2- a3+ a4- a5)*( b0- b1) # A(-1)*B(-1) |ah| <= 2 bh = 0 + v2 = ( a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1) # A(2)*B(2) ah <= 62 bh <= 2 + vm2 = ( a0- 2a1+4a2-8a3+16a4-32a5)*( b0-2b1) # A(-2)*B(-2) -41<=ah<=20 -1<=bh<=0 + vh = (32a0+16a1+8a2+4a3+ 2a4+ a5)*(2b0+ b1) # A(1/2)*B(1/2) ah <= 62 bh <= 2 + vinf= a5 * b1 # A(inf)*B(inf) +*/ + +void +mpn_toom62_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, + mp_ptr scratch) +{ + mp_size_t n, s, t; + mp_limb_t cy; + mp_ptr as1, asm1, as2, asm2, ash; + mp_ptr bs1, bsm1, bs2, bsm2, bsh; + mp_ptr gp; + enum toom7_flags aflags, bflags; + TMP_DECL; + +#define a0 ap +#define a1 (ap + n) +#define a2 (ap + 2*n) +#define a3 (ap + 3*n) +#define a4 (ap + 4*n) +#define a5 (ap + 5*n) +#define b0 bp +#define b1 (bp + n) + + n = 1 + (an >= 3 * bn ? (an - 1) / (size_t) 6 : (bn - 1) >> 1); + + s = an - 5 * n; + t = bn - n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + + TMP_MARK; + + as1 = TMP_SALLOC_LIMBS (n + 1); + asm1 = TMP_SALLOC_LIMBS (n + 1); + as2 = TMP_SALLOC_LIMBS (n + 1); + asm2 = TMP_SALLOC_LIMBS (n + 1); + ash = TMP_SALLOC_LIMBS (n + 1); + + bs1 = TMP_SALLOC_LIMBS (n + 1); + bsm1 = TMP_SALLOC_LIMBS (n); + bs2 = TMP_SALLOC_LIMBS (n + 1); + bsm2 = TMP_SALLOC_LIMBS (n + 1); + bsh = TMP_SALLOC_LIMBS (n + 1); + + gp = pp; + + /* Compute as1 and asm1. */ + aflags = (enum toom7_flags) (toom7_w3_neg & mpn_toom_eval_pm1 (as1, asm1, 5, ap, n, s, gp)); + + /* Compute as2 and asm2. 
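+     as2 = A(2) and asm2 = |A(-2)| for the degree-5 A(x); mpn_toom_eval_pm2
+     returns a mask which, ANDed with toom7_w1_neg, records the sign of A(-2)
+     in aflags.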
*/ + aflags = (enum toom7_flags) (aflags | (toom7_w1_neg & mpn_toom_eval_pm2 (as2, asm2, 5, ap, n, s, gp))); + + /* Compute ash = 32 a0 + 16 a1 + 8 a2 + 4 a3 + 2 a4 + a5 + = 2*(2*(2*(2*(2*a0 + a1) + a2) + a3) + a4) + a5 */ + +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (ash, a1, a0, n); + cy = 2*cy + mpn_addlsh1_n (ash, a2, ash, n); + cy = 2*cy + mpn_addlsh1_n (ash, a3, ash, n); + cy = 2*cy + mpn_addlsh1_n (ash, a4, ash, n); + if (s < n) + { + mp_limb_t cy2; + cy2 = mpn_addlsh1_n (ash, a5, ash, s); + ash[n] = 2*cy + mpn_lshift (ash + s, ash + s, n - s, 1); + MPN_INCR_U (ash + s, n+1-s, cy2); + } + else + ash[n] = 2*cy + mpn_addlsh1_n (ash, a5, ash, n); +#else + cy = mpn_lshift (ash, a0, n, 1); + cy += mpn_add_n (ash, ash, a1, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a2, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a3, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + cy += mpn_add_n (ash, ash, a4, n); + cy = 2*cy + mpn_lshift (ash, ash, n, 1); + ash[n] = cy + mpn_add (ash, ash, n, a5, s); +#endif + + /* Compute bs1 and bsm1. */ + if (t == n) + { +#if HAVE_NATIVE_mpn_add_n_sub_n + if (mpn_cmp (b0, b1, n) < 0) + { + cy = mpn_add_n_sub_n (bs1, bsm1, b1, b0, n); + bflags = toom7_w3_neg; + } + else + { + cy = mpn_add_n_sub_n (bs1, bsm1, b0, b1, n); + bflags = (enum toom7_flags) 0; + } + bs1[n] = cy >> 1; +#else + bs1[n] = mpn_add_n (bs1, b0, b1, n); + if (mpn_cmp (b0, b1, n) < 0) + { + mpn_sub_n (bsm1, b1, b0, n); + bflags = toom7_w3_neg; + } + else + { + mpn_sub_n (bsm1, b0, b1, n); + bflags = (enum toom7_flags) 0; + } +#endif + } + else + { + bs1[n] = mpn_add (bs1, b0, n, b1, t); + if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0) + { + mpn_sub_n (bsm1, b1, b0, t); + MPN_ZERO (bsm1 + t, n - t); + bflags = toom7_w3_neg; + } + else + { + mpn_sub (bsm1, b0, n, b1, t); + bflags = (enum toom7_flags) 0; + } + } + + /* Compute bs2 and bsm2. Recycling bs1 and bsm1; bs2=bs1+b1, bsm2 = + bsm1 - b1 */ + mpn_add (bs2, bs1, n + 1, b1, t); + if (bflags & toom7_w3_neg) + { + bsm2[n] = mpn_add (bsm2, bsm1, n, b1, t); + bflags = (enum toom7_flags) (bflags | toom7_w1_neg); + } + else + { + /* FIXME: Simplify this logic? */ + if (t < n) + { + if (mpn_zero_p (bsm1 + t, n - t) && mpn_cmp (bsm1, b1, t) < 0) + { + ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, t)); + MPN_ZERO (bsm2 + t, n + 1 - t); + bflags = (enum toom7_flags) (bflags | toom7_w1_neg); + } + else + { + ASSERT_NOCARRY (mpn_sub (bsm2, bsm1, n, b1, t)); + bsm2[n] = 0; + } + } + else + { + if (mpn_cmp (bsm1, b1, n) < 0) + { + ASSERT_NOCARRY (mpn_sub_n (bsm2, b1, bsm1, n)); + bflags = (enum toom7_flags) (bflags | toom7_w1_neg); + } + else + { + ASSERT_NOCARRY (mpn_sub_n (bsm2, bsm1, b1, n)); + } + bsm2[n] = 0; + } + } + + /* Compute bsh, recycling bs1. bsh=bs1+b0; */ + bsh[n] = bs1[n] + mpn_add_n (bsh, bs1, b0, n); + + ASSERT (as1[n] <= 5); + ASSERT (bs1[n] <= 1); + ASSERT (asm1[n] <= 2); + ASSERT (as2[n] <= 62); + ASSERT (bs2[n] <= 2); + ASSERT (asm2[n] <= 41); + ASSERT (bsm2[n] <= 1); + ASSERT (ash[n] <= 62); + ASSERT (bsh[n] <= 2); + +#define v0 pp /* 2n */ +#define v1 (pp + 2 * n) /* 2n+1 */ +#define vinf (pp + 6 * n) /* s+t */ +#define v2 scratch /* 2n+1 */ +#define vm2 (scratch + 2 * n + 1) /* 2n+1 */ +#define vh (scratch + 4 * n + 2) /* 2n+1 */ +#define vm1 (scratch + 6 * n + 3) /* 2n+1 */ +#define scratch_out (scratch + 8 * n + 4) /* 2n+1 */ + /* Total scratch need: 10*n+5 */ + + /* Must be in allocation order, as they overwrite one limb beyond + * 2n+1. 
*/ + mpn_mul_n (v2, as2, bs2, n + 1); /* v2, 2n+1 limbs */ + mpn_mul_n (vm2, asm2, bsm2, n + 1); /* vm2, 2n+1 limbs */ + mpn_mul_n (vh, ash, bsh, n + 1); /* vh, 2n+1 limbs */ + + /* vm1, 2n+1 limbs */ + mpn_mul_n (vm1, asm1, bsm1, n); + cy = 0; + if (asm1[n] == 1) + { + cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n); + } + else if (asm1[n] == 2) + { +#if HAVE_NATIVE_mpn_addlsh1_n + cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n); +#else + cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2)); +#endif + } + vm1[2 * n] = cy; + + /* v1, 2n+1 limbs */ + mpn_mul_n (v1, as1, bs1, n); + if (as1[n] == 1) + { + cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n); + } + else if (as1[n] == 2) + { +#if HAVE_NATIVE_mpn_addlsh1_n + cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n); +#else + cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2)); +#endif + } + else if (as1[n] != 0) + { + cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]); + } + else + cy = 0; + if (bs1[n] != 0) + cy += mpn_add_n (v1 + n, v1 + n, as1, n); + v1[2 * n] = cy; + + mpn_mul_n (v0, a0, b0, n); /* v0, 2n limbs */ + + /* vinf, s+t limbs */ + if (s > t) mpn_mul (vinf, a5, s, b1, t); + else mpn_mul (vinf, b1, t, a5, s); + + mpn_toom_interpolate_7pts (pp, n, (enum toom7_flags) (aflags ^ bflags), + vm2, vm1, v2, vh, s + t, scratch_out); + + TMP_FREE; +} diff --git a/gmp-6.3.0/mpn/generic/toom63_mul.c b/gmp-6.3.0/mpn/generic/toom63_mul.c new file mode 100644 index 0000000..181996d --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom63_mul.c @@ -0,0 +1,231 @@ +/* Implementation of the algorithm for Toom-Cook 4.5-way. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Stores |{ap,n}-{bp,n}| in {rp,n}, returns the sign. 
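+   The return value is 0 when {ap,n} >= {bp,n} and ~0 otherwise, so callers can
+   fold it into the running sign with XOR.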
*/ +static int +abs_sub_n (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n) +{ + mp_limb_t x, y; + while (--n >= 0) + { + x = ap[n]; + y = bp[n]; + if (x != y) + { + n++; + if (x > y) + { + mpn_sub_n (rp, ap, bp, n); + return 0; + } + else + { + mpn_sub_n (rp, bp, ap, n); + return ~0; + } + } + rp[n] = 0; + } + return 0; +} + +static int +abs_sub_add_n (mp_ptr rm, mp_ptr rp, mp_srcptr rs, mp_size_t n) { + int result; + result = abs_sub_n (rm, rp, rs, n); + ASSERT_NOCARRY(mpn_add_n (rp, rp, rs, n)); + return result; +} + + +/* Toom-4.5, the splitting 6x3 unbalanced version. + Evaluate in: infinity, +4, -4, +2, -2, +1, -1, 0. + + <--s-><--n--><--n--><--n--><--n--><--n--> + ____ ______ ______ ______ ______ ______ + |_a5_|__a4__|__a3__|__a2__|__a1__|__a0__| + |b2_|__b1__|__b0__| + <-t-><--n--><--n--> + +*/ +#define TOOM_63_MUL_N_REC(p, a, b, n, ws) \ + do { mpn_mul_n (p, a, b, n); \ + } while (0) + +#define TOOM_63_MUL_REC(p, a, na, b, nb, ws) \ + do { mpn_mul (p, a, na, b, nb); \ + } while (0) + +void +mpn_toom63_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, mp_ptr scratch) +{ + mp_size_t n, s, t; + mp_limb_t cy; + int sign; + + /***************************** decomposition *******************************/ +#define a5 (ap + 5 * n) +#define b0 (bp + 0 * n) +#define b1 (bp + 1 * n) +#define b2 (bp + 2 * n) + + ASSERT (an >= bn); + n = 1 + (an >= 2 * bn ? (an - 1) / (size_t) 6 : (bn - 1) / (size_t) 3); + + s = an - 5 * n; + t = bn - 2 * n; + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + /* WARNING! it assumes s+t>=n */ + ASSERT ( s + t >= n ); + ASSERT ( s + t > 4); + /* WARNING! it assumes n>1 */ + ASSERT ( n > 2); + +#define r8 pp /* 2n */ +#define r7 scratch /* 3n+1 */ +#define r5 (pp + 3*n) /* 3n+1 */ +#define v0 (pp + 3*n) /* n+1 */ +#define v1 (pp + 4*n+1) /* n+1 */ +#define v2 (pp + 5*n+2) /* n+1 */ +#define v3 (pp + 6*n+3) /* n+1 */ +#define r3 (scratch + 3 * n + 1) /* 3n+1 */ +#define r1 (pp + 7*n) /* s+t <= 2*n */ +#define ws (scratch + 6 * n + 2) /* ??? */ + + /* Alloc also 3n+1 limbs for ws... mpn_toom_interpolate_8pts may + need all of them, when DO_mpn_sublsh_n usea a scratch */ +/* if (scratch == NULL) scratch = TMP_SALLOC_LIMBS (9 * n + 3); */ + + /********************** evaluation and recursive calls *********************/ + /* $\pm4$ */ + sign = mpn_toom_eval_pm2exp (v2, v0, 5, ap, n, s, 2, pp); + pp[n] = mpn_lshift (pp, b1, n, 2); /* 4b1 */ + /* FIXME: use addlsh */ + v3[t] = mpn_lshift (v3, b2, t, 4);/* 16b2 */ + if ( n == t ) + v3[n]+= mpn_add_n (v3, v3, b0, n); /* 16b2+b0 */ + else + v3[n] = mpn_add (v3, b0, n, v3, t+1); /* 16b2+b0 */ + sign ^= abs_sub_add_n (v1, v3, pp, n + 1); + TOOM_63_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-4)*B(-4) */ + TOOM_63_MUL_N_REC(r3, v2, v3, n + 1, ws); /* A(+4)*B(+4) */ + mpn_toom_couple_handling (r3, 2*n+1, pp, sign, n, 2, 4); + + /* $\pm1$ */ + sign = mpn_toom_eval_pm1 (v2, v0, 5, ap, n, s, pp); + /* Compute bs1 and bsm1. 
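+     (The results land in v3 = B(1) = b0 + b1 + b2 and v1 = |B(-1)| = |b0 - b1 + b2|,
+     built from ws = b0 + b2.)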
Code taken from toom33 */ + cy = mpn_add (ws, b0, n, b2, t); +#if HAVE_NATIVE_mpn_add_n_sub_n + if (cy == 0 && mpn_cmp (ws, b1, n) < 0) + { + cy = mpn_add_n_sub_n (v3, v1, b1, ws, n); + v3[n] = cy >> 1; + v1[n] = 0; + sign = ~sign; + } + else + { + mp_limb_t cy2; + cy2 = mpn_add_n_sub_n (v3, v1, ws, b1, n); + v3[n] = cy + (cy2 >> 1); + v1[n] = cy - (cy2 & 1); + } +#else + v3[n] = cy + mpn_add_n (v3, ws, b1, n); + if (cy == 0 && mpn_cmp (ws, b1, n) < 0) + { + mpn_sub_n (v1, b1, ws, n); + v1[n] = 0; + sign = ~sign; + } + else + { + cy -= mpn_sub_n (v1, ws, b1, n); + v1[n] = cy; + } +#endif + TOOM_63_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-1)*B(-1) */ + TOOM_63_MUL_N_REC(r7, v2, v3, n + 1, ws); /* A(1)*B(1) */ + mpn_toom_couple_handling (r7, 2*n+1, pp, sign, n, 0, 0); + + /* $\pm2$ */ + sign = mpn_toom_eval_pm2 (v2, v0, 5, ap, n, s, pp); + pp[n] = mpn_lshift (pp, b1, n, 1); /* 2b1 */ + /* FIXME: use addlsh or addlsh2 */ + v3[t] = mpn_lshift (v3, b2, t, 2);/* 4b2 */ + if ( n == t ) + v3[n]+= mpn_add_n (v3, v3, b0, n); /* 4b2+b0 */ + else + v3[n] = mpn_add (v3, b0, n, v3, t+1); /* 4b2+b0 */ + sign ^= abs_sub_add_n (v1, v3, pp, n + 1); + TOOM_63_MUL_N_REC(pp, v0, v1, n + 1, ws); /* A(-2)*B(-2) */ + TOOM_63_MUL_N_REC(r5, v2, v3, n + 1, ws); /* A(+2)*B(+2) */ + mpn_toom_couple_handling (r5, 2*n+1, pp, sign, n, 1, 2); + + /* A(0)*B(0) */ + TOOM_63_MUL_N_REC(pp, ap, bp, n, ws); + + /* Infinity */ + if (s > t) { + TOOM_63_MUL_REC(r1, a5, s, b2, t, ws); + } else { + TOOM_63_MUL_REC(r1, b2, t, a5, s, ws); + }; + + mpn_toom_interpolate_8pts (pp, n, r3, r7, s + t, ws); + +#undef a5 +#undef b0 +#undef b1 +#undef b2 +#undef r1 +#undef r3 +#undef r5 +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef r7 +#undef r8 +#undef ws +} diff --git a/gmp-6.3.0/mpn/generic/toom6_sqr.c b/gmp-6.3.0/mpn/generic/toom6_sqr.c new file mode 100644 index 0000000..336eef9 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom6_sqr.c @@ -0,0 +1,181 @@ +/* Implementation of the squaring algorithm with Toom-Cook 6.5-way. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + +#if GMP_NUMB_BITS < 21 +#error Not implemented. 
+#endif + + +#if TUNE_PROGRAM_BUILD +#define MAYBE_sqr_basecase 1 +#define MAYBE_sqr_above_basecase 1 +#define MAYBE_sqr_toom2 1 +#define MAYBE_sqr_above_toom2 1 +#define MAYBE_sqr_toom3 1 +#define MAYBE_sqr_above_toom3 1 +#define MAYBE_sqr_above_toom4 1 +#else +#ifdef SQR_TOOM8_THRESHOLD +#define SQR_TOOM6_MAX ((SQR_TOOM8_THRESHOLD+6*2-1+5)/6) +#else +#define SQR_TOOM6_MAX \ + ((SQR_FFT_THRESHOLD <= MP_SIZE_T_MAX - (6*2-1+5)) ? \ + ((SQR_FFT_THRESHOLD+6*2-1+5)/6) \ + : MP_SIZE_T_MAX ) +#endif +#define MAYBE_sqr_basecase \ + (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM2_THRESHOLD) +#define MAYBE_sqr_above_basecase \ + (SQR_TOOM6_MAX >= SQR_TOOM2_THRESHOLD) +#define MAYBE_sqr_toom2 \ + (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM3_THRESHOLD) +#define MAYBE_sqr_above_toom2 \ + (SQR_TOOM6_MAX >= SQR_TOOM3_THRESHOLD) +#define MAYBE_sqr_toom3 \ + (SQR_TOOM6_THRESHOLD < 6 * SQR_TOOM4_THRESHOLD) +#define MAYBE_sqr_above_toom3 \ + (SQR_TOOM6_MAX >= SQR_TOOM4_THRESHOLD) +#define MAYBE_sqr_above_toom4 \ + (SQR_TOOM6_MAX >= SQR_TOOM6_THRESHOLD) +#endif + +#define TOOM6_SQR_REC(p, a, n, ws) \ + do { \ + if (MAYBE_sqr_basecase && ( !MAYBE_sqr_above_basecase \ + || BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))) \ + mpn_sqr_basecase (p, a, n); \ + else if (MAYBE_sqr_toom2 && ( !MAYBE_sqr_above_toom2 \ + || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))) \ + mpn_toom2_sqr (p, a, n, ws); \ + else if (MAYBE_sqr_toom3 && ( !MAYBE_sqr_above_toom3 \ + || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))) \ + mpn_toom3_sqr (p, a, n, ws); \ + else if (! MAYBE_sqr_above_toom4 \ + || BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD)) \ + mpn_toom4_sqr (p, a, n, ws); \ + else \ + mpn_toom6_sqr (p, a, n, ws); \ + } while (0) + +void +mpn_toom6_sqr (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch) +{ + mp_size_t n, s; + + /***************************** decomposition *******************************/ + + ASSERT( an >= 18 ); + + n = 1 + (an - 1) / (size_t) 6; + + s = an - 5 * n; + + ASSERT (0 < s && s <= n); + +#define r4 (pp + 3 * n) /* 3n+1 */ +#define r2 (pp + 7 * n) /* 3n+1 */ +#define r0 (pp +11 * n) /* s+t <= 2*n */ +#define r5 (scratch) /* 3n+1 */ +#define r3 (scratch + 3 * n + 1) /* 3n+1 */ +#define r1 (scratch + 6 * n + 2) /* 3n+1 */ +#define v0 (pp + 7 * n) /* n+1 */ +#define v2 (pp + 9 * n+2) /* n+1 */ +#define wse (scratch + 9 * n + 3) /* 3n+1 */ + + /* Alloc also 3n+1 limbs for ws... toom_interpolate_12pts may + need all of them, when DO_mpn_sublsh_n usea a scratch */ +/* if (scratch== NULL) */ +/* scratch = TMP_SALLOC_LIMBS (12 * n + 6); */ + + /********************** evaluation and recursive calls *********************/ + /* $\pm1/2$ */ + mpn_toom_eval_pm2rexp (v2, v0, 5, ap, n, s, 1, pp); + TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1/2)*B(-1/2)*2^. */ + TOOM6_SQR_REC(r5, v2, n + 1, wse); /* A(+1/2)*B(+1/2)*2^. */ + mpn_toom_couple_handling (r5, 2 * n + 1, pp, 0, n, 1, 0); + + /* $\pm1$ */ + mpn_toom_eval_pm1 (v2, v0, 5, ap, n, s, pp); + TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1)*B(-1) */ + TOOM6_SQR_REC(r3, v2, n + 1, wse); /* A(1)*B(1) */ + mpn_toom_couple_handling (r3, 2 * n + 1, pp, 0, n, 0, 0); + + /* $\pm4$ */ + mpn_toom_eval_pm2exp (v2, v0, 5, ap, n, s, 2, pp); + TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-4)*B(-4) */ + TOOM6_SQR_REC(r1, v2, n + 1, wse); /* A(+4)*B(+4) */ + mpn_toom_couple_handling (r1, 2 * n + 1, pp, 0, n, 2, 4); + + /* $\pm1/4$ */ + mpn_toom_eval_pm2rexp (v2, v0, 5, ap, n, s, 2, pp); + TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-1/4)*B(-1/4)*4^. */ + TOOM6_SQR_REC(r4, v2, n + 1, wse); /* A(+1/4)*B(+1/4)*4^. 
*/ + mpn_toom_couple_handling (r4, 2 * n + 1, pp, 0, n, 2, 0); + + /* $\pm2$ */ + mpn_toom_eval_pm2 (v2, v0, 5, ap, n, s, pp); + TOOM6_SQR_REC(pp, v0, n + 1, wse); /* A(-2)*B(-2) */ + TOOM6_SQR_REC(r2, v2, n + 1, wse); /* A(+2)*B(+2) */ + mpn_toom_couple_handling (r2, 2 * n + 1, pp, 0, n, 1, 2); + +#undef v0 +#undef v2 + + /* A(0)*B(0) */ + TOOM6_SQR_REC(pp, ap, n, wse); + + mpn_toom_interpolate_12pts (pp, r1, r3, r5, n, 2 * s, 0, wse); + +#undef r0 +#undef r1 +#undef r2 +#undef r3 +#undef r4 +#undef r5 + +} +#undef TOOM6_SQR_REC +#undef MAYBE_sqr_basecase +#undef MAYBE_sqr_above_basecase +#undef MAYBE_sqr_toom2 +#undef MAYBE_sqr_above_toom2 +#undef MAYBE_sqr_toom3 +#undef MAYBE_sqr_above_toom3 +#undef MAYBE_sqr_above_toom4 diff --git a/gmp-6.3.0/mpn/generic/toom6h_mul.c b/gmp-6.3.0/mpn/generic/toom6h_mul.c new file mode 100644 index 0000000..637f2a5 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom6h_mul.c @@ -0,0 +1,262 @@ +/* Implementation of the multiplication algorithm for Toom-Cook 6.5-way. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + +#if GMP_NUMB_BITS < 21 +#error Not implemented. +#endif + +#if TUNE_PROGRAM_BUILD +#define MAYBE_mul_basecase 1 +#define MAYBE_mul_toom22 1 +#define MAYBE_mul_toom33 1 +#define MAYBE_mul_toom6h 1 +#else +#define MAYBE_mul_basecase \ + (MUL_TOOM6H_THRESHOLD < 6 * MUL_TOOM22_THRESHOLD) +#define MAYBE_mul_toom22 \ + (MUL_TOOM6H_THRESHOLD < 6 * MUL_TOOM33_THRESHOLD) +#define MAYBE_mul_toom33 \ + (MUL_TOOM6H_THRESHOLD < 6 * MUL_TOOM44_THRESHOLD) +#define MAYBE_mul_toom6h \ + (MUL_FFT_THRESHOLD >= 6 * MUL_TOOM6H_THRESHOLD) +#endif + +#define TOOM6H_MUL_N_REC(p, a, b, f, p2, a2, b2, n, ws) \ + do { \ + if (MAYBE_mul_basecase \ + && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) { \ + mpn_mul_basecase (p, a, n, b, n); \ + if (f) \ + mpn_mul_basecase (p2, a2, n, b2, n); \ + } else if (MAYBE_mul_toom22 \ + && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) { \ + mpn_toom22_mul (p, a, n, b, n, ws); \ + if (f) \ + mpn_toom22_mul (p2, a2, n, b2, n, ws); \ + } else if (MAYBE_mul_toom33 \ + && BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) { \ + mpn_toom33_mul (p, a, n, b, n, ws); \ + if (f) \ + mpn_toom33_mul (p2, a2, n, b2, n, ws); \ + } else if (! 
MAYBE_mul_toom6h \ + || BELOW_THRESHOLD (n, MUL_TOOM6H_THRESHOLD)) { \ + mpn_toom44_mul (p, a, n, b, n, ws); \ + if (f) \ + mpn_toom44_mul (p2, a2, n, b2, n, ws); \ + } else { \ + mpn_toom6h_mul (p, a, n, b, n, ws); \ + if (f) \ + mpn_toom6h_mul (p2, a2, n, b2, n, ws); \ + } \ + } while (0) + +#define TOOM6H_MUL_REC(p, a, na, b, nb, ws) \ + do { mpn_mul (p, a, na, b, nb); \ + } while (0) + +/* Toom-6.5 , compute the product {pp,an+bn} <- {ap,an} * {bp,bn} + With: an >= bn >= 46, an*6 < bn * 17. + It _may_ work with bn<=46 and bn*17 < an*6 < bn*18 + + Evaluate in: infinity, +4, -4, +2, -2, +1, -1, +1/2, -1/2, +1/4, -1/4, 0. +*/ +/* Estimate on needed scratch: + S(n) <= (n+5)\6*10+4+MAX(S((n+5)\6),1+2*(n+5)\6), + since n>42; S(n) <= ceil(log(n)/log(6))*(10+4)+n*12\6 < n*2 + lg2(n)*6 + */ + +void +mpn_toom6h_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, mp_ptr scratch) +{ + mp_size_t n, s, t; + int p, q, half; + int sign; + + /***************************** decomposition *******************************/ + + ASSERT (an >= bn); + /* Can not handle too much unbalancement */ + ASSERT (bn >= 42); + /* Can not handle too much unbalancement */ + ASSERT ((an*3 < bn * 8) || (bn >= 46 && an * 6 < bn * 17)); + + /* Limit num/den is a rational number between + (12/11)^(log(4)/log(2*4-1)) and (12/11)^(log(6)/log(2*6-1)) */ +#define LIMIT_numerator (18) +#define LIMIT_denominat (17) + + if (LIKELY (an * LIMIT_denominat < LIMIT_numerator * bn)) /* is 6*... < 6*... */ + { + n = 1 + (an - 1) / (size_t) 6; + p = q = 5; + half = 0; + + s = an - 5 * n; + t = bn - 5 * n; + } + else { + if (an * 5 * LIMIT_numerator < LIMIT_denominat * 7 * bn) + { p = 7; q = 6; } + else if (an * 5 * LIMIT_denominat < LIMIT_numerator * 7 * bn) + { p = 7; q = 5; } + else if (an * LIMIT_numerator < LIMIT_denominat * 2 * bn) /* is 4*... < 8*... */ + { p = 8; q = 5; } + else if (an * LIMIT_denominat < LIMIT_numerator * 2 * bn) /* is 4*... < 8*... */ + { p = 8; q = 4; } + else + { p = 9; q = 4; } + + half = (p ^ q) & 1; + n = 1 + (q * an >= p * bn ? (an - 1) / (size_t) p : (bn - 1) / (size_t) q); + p--; q--; + + s = an - p * n; + t = bn - q * n; + + /* With LIMIT = 16/15, the following recover is needed only if bn<=73*/ + if (half) { /* Recover from badly chosen splitting */ + if (UNLIKELY (s<1)) {p--; s+=n; half=0;} + else if (UNLIKELY (t<1)) {q--; t+=n; half=0;} + } + } +#undef LIMIT_numerator +#undef LIMIT_denominat + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + ASSERT (half || s + t > 3); + ASSERT (n > 2); + +#define r4 (pp + 3 * n) /* 3n+1 */ +#define r2 (pp + 7 * n) /* 3n+1 */ +#define r0 (pp +11 * n) /* s+t <= 2*n */ +#define r5 (scratch) /* 3n+1 */ +#define r3 (scratch + 3 * n + 1) /* 3n+1 */ +#define r1 (scratch + 6 * n + 2) /* 3n+1 */ +#define v0 (pp + 7 * n) /* n+1 */ +#define v1 (pp + 8 * n+1) /* n+1 */ +#define v2 (pp + 9 * n+2) /* n+1 */ +#define v3 (scratch + 9 * n + 3) /* n+1 */ +#define wsi (scratch + 9 * n + 3) /* 3n+1 */ +#define wse (scratch +10 * n + 4) /* 2n+1 */ + + /* Alloc also 3n+1 limbs for wsi... 
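
The splitting above chooses a common piece size n and piece counts so that ap
falls into p full pieces of n limbs plus a top piece of s limbs, and bp into q
full pieces plus a top piece of t limbs.  A stand-alone sketch of the balanced
branch (p = q = 5, taken when an*17 < 18*bn); the operand sizes are
illustrative, not from the GMP sources:

#include <assert.h>
#include <stddef.h>

int main (void)
{
  size_t an = 50, bn = 48;          /* illustrative, an >= bn >= 42 and an*17 < 18*bn */
  size_t n = 1 + (an - 1) / 6;      /* common piece size */
  size_t s = an - 5 * n;            /* top piece of ap */
  size_t t = bn - 5 * n;            /* top piece of bp */
  assert (n == 9 && s == 5 && t == 3);
  assert (0 < s && s <= n && 0 < t && t <= n);
  assert (an == 5 * n + s && bn == 5 * n + t);
  return 0;
}
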
toom_interpolate_12pts may + need all of them */ +/* if (scratch == NULL) */ +/* scratch = TMP_SALLOC_LIMBS(mpn_toom6_sqr_itch(n * 6)); */ + ASSERT (12 * n + 6 <= mpn_toom6h_mul_itch(an,bn)); + ASSERT (12 * n + 6 <= mpn_toom6_sqr_itch(n * 6)); + + /********************** evaluation and recursive calls *********************/ + /* $\pm1/2$ */ + sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 1, pp) ^ + mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 1, pp); + /* A(-1/2)*B(-1/2)*2^. */ /* A(+1/2)*B(+1/2)*2^. */ + TOOM6H_MUL_N_REC(pp, v0, v1, 2, r5, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r5, 2 * n + 1, pp, sign, n, 1+half , half); + + /* $\pm1$ */ + sign = mpn_toom_eval_pm1 (v2, v0, p, ap, n, s, pp); + if (UNLIKELY (q == 3)) + sign ^= mpn_toom_eval_dgr3_pm1 (v3, v1, bp, n, t, pp); + else + sign ^= mpn_toom_eval_pm1 (v3, v1, q, bp, n, t, pp); + /* A(-1)*B(-1) */ /* A(1)*B(1) */ + TOOM6H_MUL_N_REC(pp, v0, v1, 2, r3, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r3, 2 * n + 1, pp, sign, n, 0, 0); + + /* $\pm4$ */ + sign = mpn_toom_eval_pm2exp (v2, v0, p, ap, n, s, 2, pp) ^ + mpn_toom_eval_pm2exp (v3, v1, q, bp, n, t, 2, pp); + /* A(-4)*B(-4) */ + TOOM6H_MUL_N_REC(pp, v0, v1, 2, r1, v2, v3, n + 1, wse); /* A(+4)*B(+4) */ + mpn_toom_couple_handling (r1, 2 * n + 1, pp, sign, n, 2, 4); + + /* $\pm1/4$ */ + sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 2, pp) ^ + mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 2, pp); + /* A(-1/4)*B(-1/4)*4^. */ /* A(+1/4)*B(+1/4)*4^. */ + TOOM6H_MUL_N_REC(pp, v0, v1, 2, r4, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r4, 2 * n + 1, pp, sign, n, 2*(1+half), 2*(half)); + + /* $\pm2$ */ + sign = mpn_toom_eval_pm2 (v2, v0, p, ap, n, s, pp) ^ + mpn_toom_eval_pm2 (v3, v1, q, bp, n, t, pp); + /* A(-2)*B(-2) */ /* A(+2)*B(+2) */ + TOOM6H_MUL_N_REC(pp, v0, v1, 2, r2, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r2, 2 * n + 1, pp, sign, n, 1, 2); + +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef wse + + /* A(0)*B(0) */ + TOOM6H_MUL_N_REC(pp, ap, bp, 0, pp, ap, bp, n, wsi); + + /* Infinity */ + if (UNLIKELY (half != 0)) { + if (s > t) { + TOOM6H_MUL_REC(r0, ap + p * n, s, bp + q * n, t, wsi); + } else { + TOOM6H_MUL_REC(r0, bp + q * n, t, ap + p * n, s, wsi); + }; + }; + + mpn_toom_interpolate_12pts (pp, r1, r3, r5, n, s+t, half, wsi); + +#undef r0 +#undef r1 +#undef r2 +#undef r3 +#undef r4 +#undef r5 +#undef wsi +} + +#undef TOOM6H_MUL_N_REC +#undef TOOM6H_MUL_REC +#undef MAYBE_mul_basecase +#undef MAYBE_mul_toom22 +#undef MAYBE_mul_toom33 +#undef MAYBE_mul_toom6h diff --git a/gmp-6.3.0/mpn/generic/toom8_sqr.c b/gmp-6.3.0/mpn/generic/toom8_sqr.c new file mode 100644 index 0000000..03e5c64 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom8_sqr.c @@ -0,0 +1,225 @@ +/* Implementation of the squaring algorithm with Toom-Cook 8.5-way. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +#if GMP_NUMB_BITS < 29 +#error Not implemented. +#endif + +#if GMP_NUMB_BITS < 43 +#define BIT_CORRECTION 1 +#define CORRECTION_BITS GMP_NUMB_BITS +#else +#define BIT_CORRECTION 0 +#define CORRECTION_BITS 0 +#endif + +#ifndef SQR_TOOM8_THRESHOLD +#define SQR_TOOM8_THRESHOLD MUL_TOOM8H_THRESHOLD +#endif + +#ifndef SQR_TOOM6_THRESHOLD +#define SQR_TOOM6_THRESHOLD MUL_TOOM6H_THRESHOLD +#endif + +#if TUNE_PROGRAM_BUILD +#define MAYBE_sqr_basecase 1 +#define MAYBE_sqr_above_basecase 1 +#define MAYBE_sqr_toom2 1 +#define MAYBE_sqr_above_toom2 1 +#define MAYBE_sqr_toom3 1 +#define MAYBE_sqr_above_toom3 1 +#define MAYBE_sqr_toom4 1 +#define MAYBE_sqr_above_toom4 1 +#define MAYBE_sqr_above_toom6 1 +#else +#define SQR_TOOM8_MAX \ + ((SQR_FFT_THRESHOLD <= MP_SIZE_T_MAX - (8*2-1+7)) ? \ + ((SQR_FFT_THRESHOLD+8*2-1+7)/8) \ + : MP_SIZE_T_MAX ) +#define MAYBE_sqr_basecase \ + (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM2_THRESHOLD) +#define MAYBE_sqr_above_basecase \ + (SQR_TOOM8_MAX >= SQR_TOOM2_THRESHOLD) +#define MAYBE_sqr_toom2 \ + (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM3_THRESHOLD) +#define MAYBE_sqr_above_toom2 \ + (SQR_TOOM8_MAX >= SQR_TOOM3_THRESHOLD) +#define MAYBE_sqr_toom3 \ + (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM4_THRESHOLD) +#define MAYBE_sqr_above_toom3 \ + (SQR_TOOM8_MAX >= SQR_TOOM4_THRESHOLD) +#define MAYBE_sqr_toom4 \ + (SQR_TOOM8_THRESHOLD < 8 * SQR_TOOM6_THRESHOLD) +#define MAYBE_sqr_above_toom4 \ + (SQR_TOOM8_MAX >= SQR_TOOM6_THRESHOLD) +#define MAYBE_sqr_above_toom6 \ + (SQR_TOOM8_MAX >= SQR_TOOM8_THRESHOLD) +#endif + +#define TOOM8_SQR_REC(p, a, f, p2, a2, n, ws) \ + do { \ + if (MAYBE_sqr_basecase && ( !MAYBE_sqr_above_basecase \ + || BELOW_THRESHOLD (n, SQR_TOOM2_THRESHOLD))) { \ + mpn_sqr_basecase (p, a, n); \ + if (f) mpn_sqr_basecase (p2, a2, n); \ + } else if (MAYBE_sqr_toom2 && ( !MAYBE_sqr_above_toom2 \ + || BELOW_THRESHOLD (n, SQR_TOOM3_THRESHOLD))) { \ + mpn_toom2_sqr (p, a, n, ws); \ + if (f) mpn_toom2_sqr (p2, a2, n, ws); \ + } else if (MAYBE_sqr_toom3 && ( !MAYBE_sqr_above_toom3 \ + || BELOW_THRESHOLD (n, SQR_TOOM4_THRESHOLD))) { \ + mpn_toom3_sqr (p, a, n, ws); \ + if (f) mpn_toom3_sqr (p2, a2, n, ws); \ + } else if (MAYBE_sqr_toom4 && ( !MAYBE_sqr_above_toom4 \ + || BELOW_THRESHOLD (n, SQR_TOOM6_THRESHOLD))) { \ + mpn_toom4_sqr (p, a, n, ws); \ + if (f) mpn_toom4_sqr (p2, a2, n, ws); \ + } else if (! 
MAYBE_sqr_above_toom6 \ + || BELOW_THRESHOLD (n, SQR_TOOM8_THRESHOLD)) { \ + mpn_toom6_sqr (p, a, n, ws); \ + if (f) mpn_toom6_sqr (p2, a2, n, ws); \ + } else { \ + mpn_toom8_sqr (p, a, n, ws); \ + if (f) mpn_toom8_sqr (p2, a2, n, ws); \ + } \ + } while (0) + +void +mpn_toom8_sqr (mp_ptr pp, mp_srcptr ap, mp_size_t an, mp_ptr scratch) +{ + mp_size_t n, s; + + /***************************** decomposition *******************************/ + + ASSERT ( an >= 40 ); + + n = 1 + ((an - 1)>>3); + + s = an - 7 * n; + + ASSERT (0 < s && s <= n); + ASSERT ( s + s > 3 ); + +#define r6 (pp + 3 * n) /* 3n+1 */ +#define r4 (pp + 7 * n) /* 3n+1 */ +#define r2 (pp +11 * n) /* 3n+1 */ +#define r0 (pp +15 * n) /* s+t <= 2*n */ +#define r7 (scratch) /* 3n+1 */ +#define r5 (scratch + 3 * n + 1) /* 3n+1 */ +#define r3 (scratch + 6 * n + 2) /* 3n+1 */ +#define r1 (scratch + 9 * n + 3) /* 3n+1 */ +#define v0 (pp +11 * n) /* n+1 */ +#define v2 (pp +13 * n+2) /* n+1 */ +#define wse (scratch +12 * n + 4) /* 3n+1 */ + + /* Alloc also 3n+1 limbs for ws... toom_interpolate_16pts may + need all of them, when DO_mpn_sublsh_n usea a scratch */ +/* if (scratch == NULL) */ +/* scratch = TMP_SALLOC_LIMBS (30 * n + 6); */ + + /********************** evaluation and recursive calls *********************/ + /* $\pm1/8$ */ + mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 3, pp); + /* A(-1/8)*B(-1/8)*8^. */ /* A(+1/8)*B(+1/8)*8^. */ + TOOM8_SQR_REC(pp, v0, 2, r7, v2, n + 1, wse); + mpn_toom_couple_handling (r7, 2 * n + 1 + BIT_CORRECTION, pp, 0, n, 3, 0); + + /* $\pm1/4$ */ + mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 2, pp); + /* A(-1/4)*B(-1/4)*4^. */ /* A(+1/4)*B(+1/4)*4^. */ + TOOM8_SQR_REC(pp, v0, 2, r5, v2, n + 1, wse); + mpn_toom_couple_handling (r5, 2 * n + 1, pp, 0, n, 2, 0); + + /* $\pm2$ */ + mpn_toom_eval_pm2 (v2, v0, 7, ap, n, s, pp); + /* A(-2)*B(-2) */ /* A(+2)*B(+2) */ + TOOM8_SQR_REC(pp, v0, 2, r3, v2, n + 1, wse); + mpn_toom_couple_handling (r3, 2 * n + 1, pp, 0, n, 1, 2); + + /* $\pm8$ */ + mpn_toom_eval_pm2exp (v2, v0, 7, ap, n, s, 3, pp); + /* A(-8)*B(-8) */ /* A(+8)*B(+8) */ + TOOM8_SQR_REC(pp, v0, 2, r1, v2, n + 1, wse); + mpn_toom_couple_handling (r1, 2 * n + 1 + BIT_CORRECTION, pp, 0, n, 3, 6); + + /* $\pm1/2$ */ + mpn_toom_eval_pm2rexp (v2, v0, 7, ap, n, s, 1, pp); + /* A(-1/2)*B(-1/2)*2^. */ /* A(+1/2)*B(+1/2)*2^. 
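
The decomposition at the top of mpn_toom8_sqr splits {ap,an} into eight pieces,
seven of n limbs and a most significant piece of s limbs.  A minimal check of
that arithmetic (the size an is illustrative; real calls only happen above the
Toom-8 threshold):

#include <assert.h>
#include <stddef.h>

int main (void)
{
  size_t an = 100;
  size_t n = 1 + ((an - 1) >> 3);   /* piece size, ceil(an/8) = 13 */
  size_t s = an - 7 * n;            /* top piece, 100 - 91 = 9 */
  assert (n == 13 && s == 9);
  assert (0 < s && s <= n && 2 * s > 3);
  assert (an == 7 * n + s);
  return 0;
}
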
*/ + TOOM8_SQR_REC(pp, v0, 2, r6, v2, n + 1, wse); + mpn_toom_couple_handling (r6, 2 * n + 1, pp, 0, n, 1, 0); + + /* $\pm1$ */ + mpn_toom_eval_pm1 (v2, v0, 7, ap, n, s, pp); + /* A(-1)*B(-1) */ /* A(1)*B(1) */ + TOOM8_SQR_REC(pp, v0, 2, r4, v2, n + 1, wse); + mpn_toom_couple_handling (r4, 2 * n + 1, pp, 0, n, 0, 0); + + /* $\pm4$ */ + mpn_toom_eval_pm2exp (v2, v0, 7, ap, n, s, 2, pp); + /* A(-4)*B(-4) */ /* A(+4)*B(+4) */ + TOOM8_SQR_REC(pp, v0, 2, r2, v2, n + 1, wse); + mpn_toom_couple_handling (r2, 2 * n + 1, pp, 0, n, 2, 4); + +#undef v0 +#undef v2 + + /* A(0)*B(0) */ + TOOM8_SQR_REC(pp, ap, 0, pp, ap, n, wse); + + mpn_toom_interpolate_16pts (pp, r1, r3, r5, r7, n, 2 * s, 0, wse); + +#undef r0 +#undef r1 +#undef r2 +#undef r3 +#undef r4 +#undef r5 +#undef r6 +#undef wse + +} + +#undef TOOM8_SQR_REC +#undef MAYBE_sqr_basecase +#undef MAYBE_sqr_above_basecase +#undef MAYBE_sqr_toom2 +#undef MAYBE_sqr_above_toom2 +#undef MAYBE_sqr_toom3 +#undef MAYBE_sqr_above_toom3 +#undef MAYBE_sqr_above_toom4 diff --git a/gmp-6.3.0/mpn/generic/toom8h_mul.c b/gmp-6.3.0/mpn/generic/toom8h_mul.c new file mode 100644 index 0000000..5ba259a --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom8h_mul.c @@ -0,0 +1,305 @@ +/* Implementation of the multiplication algorithm for Toom-Cook 8.5-way. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + +#if GMP_NUMB_BITS < 29 +#error Not implemented. 
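
The 16-point interpolation that closes mpn_toom8_sqr above is the high-degree
version of a simple fact: the square (or product) polynomial is determined by
as many point values as it has coefficients.  The smallest case, two pieces and
the three points 0, 1 and infinity, as a toy program on 64-bit integers (base
and operand are illustrative, not GMP code):

#include <assert.h>
#include <stdint.h>

int main (void)
{
  const uint64_t B = UINT64_C(1) << 16;     /* toy "limb" base */
  uint64_t a  = UINT64_C(0x12345678);       /* operand to square, a < B^2 */
  uint64_t a0 = a % B, a1 = a / B;          /* a = a1*B + a0 */

  /* evaluate A(x) = a0 + a1*x at 0, 1, infinity, and square point-wise */
  uint64_t p0   = a0 * a0;                  /* A(0)^2,   coefficient of B^0 */
  uint64_t p1   = (a0 + a1) * (a0 + a1);    /* A(1)^2                       */
  uint64_t pinf = a1 * a1;                  /* A(inf)^2, coefficient of B^2 */

  /* interpolate the middle coefficient: A(1)^2 - A(0)^2 - A(inf)^2 = 2*a0*a1 */
  uint64_t mid = p1 - p0 - pinf;

  assert (p0 + mid * B + pinf * B * B == a * a);
  return 0;
}
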
+#endif + +#if GMP_NUMB_BITS < 43 +#define BIT_CORRECTION 1 +#define CORRECTION_BITS GMP_NUMB_BITS +#else +#define BIT_CORRECTION 0 +#define CORRECTION_BITS 0 +#endif + + +#if TUNE_PROGRAM_BUILD +#define MAYBE_mul_basecase 1 +#define MAYBE_mul_toom22 1 +#define MAYBE_mul_toom33 1 +#define MAYBE_mul_toom44 1 +#define MAYBE_mul_toom8h 1 +#else +#define MAYBE_mul_basecase \ + (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM22_THRESHOLD) +#define MAYBE_mul_toom22 \ + (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM33_THRESHOLD) +#define MAYBE_mul_toom33 \ + (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM44_THRESHOLD) +#define MAYBE_mul_toom44 \ + (MUL_TOOM8H_THRESHOLD < 8 * MUL_TOOM6H_THRESHOLD) +#define MAYBE_mul_toom8h \ + (MUL_FFT_THRESHOLD >= 8 * MUL_TOOM8H_THRESHOLD) +#endif + +#define TOOM8H_MUL_N_REC(p, a, b, f, p2, a2, b2, n, ws) \ + do { \ + if (MAYBE_mul_basecase \ + && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD)) { \ + mpn_mul_basecase (p, a, n, b, n); \ + if (f) mpn_mul_basecase (p2, a2, n, b2, n); \ + } else if (MAYBE_mul_toom22 \ + && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD)) { \ + mpn_toom22_mul (p, a, n, b, n, ws); \ + if (f) mpn_toom22_mul (p2, a2, n, b2, n, ws); \ + } else if (MAYBE_mul_toom33 \ + && BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD)) { \ + mpn_toom33_mul (p, a, n, b, n, ws); \ + if (f) mpn_toom33_mul (p2, a2, n, b2, n, ws); \ + } else if (MAYBE_mul_toom44 \ + && BELOW_THRESHOLD (n, MUL_TOOM6H_THRESHOLD)) { \ + mpn_toom44_mul (p, a, n, b, n, ws); \ + if (f) mpn_toom44_mul (p2, a2, n, b2, n, ws); \ + } else if (! MAYBE_mul_toom8h \ + || BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD)) { \ + mpn_toom6h_mul (p, a, n, b, n, ws); \ + if (f) mpn_toom6h_mul (p2, a2, n, b2, n, ws); \ + } else { \ + mpn_toom8h_mul (p, a, n, b, n, ws); \ + if (f) mpn_toom8h_mul (p2, a2, n, b2, n, ws); \ + } \ + } while (0) + +#define TOOM8H_MUL_REC(p, a, na, b, nb, ws) \ + do { mpn_mul (p, a, na, b, nb); } while (0) + +/* Toom-8.5 , compute the product {pp,an+bn} <- {ap,an} * {bp,bn} + With: an >= bn >= 86, an*5 < bn * 11. + It _may_ work with bn<=?? and bn*?? < an*? < bn*?? + + Evaluate in: infinity, +8,-8,+4,-4,+2,-2,+1,-1,+1/2,-1/2,+1/4,-1/4,+1/8,-1/8,0. +*/ +/* Estimate on needed scratch: + S(n) <= (n+7)\8*13+5+MAX(S((n+7)\8),1+2*(n+7)\8), + since n>80; S(n) <= ceil(log(n/10)/log(8))*(13+5)+n*15\8 < n*15\8 + lg2(n)*6 + */ + +void +mpn_toom8h_mul (mp_ptr pp, + mp_srcptr ap, mp_size_t an, + mp_srcptr bp, mp_size_t bn, mp_ptr scratch) +{ + mp_size_t n, s, t; + int p, q, half; + int sign; + + /***************************** decomposition *******************************/ + + ASSERT (an >= bn); + /* Can not handle too small operands */ + ASSERT (bn >= 86); + /* Can not handle too much unbalancement */ + ASSERT (an <= bn*4); + ASSERT (GMP_NUMB_BITS > 11*3 || an*4 <= bn*11); + ASSERT (GMP_NUMB_BITS > 10*3 || an*1 <= bn* 2); + ASSERT (GMP_NUMB_BITS > 9*3 || an*2 <= bn* 3); + + /* Limit num/den is a rational number between + (16/15)^(log(6)/log(2*6-1)) and (16/15)^(log(8)/log(2*8-1)) */ +#define LIMIT_numerator (21) +#define LIMIT_denominat (20) + + if (LIKELY (an == bn) || an * (LIMIT_denominat>>1) < LIMIT_numerator * (bn>>1) ) /* is 8*... < 8*... */ + { + half = 0; + n = 1 + ((an - 1)>>3); + p = q = 7; + s = an - 7 * n; + t = bn - 7 * n; + } + else + { + if (an * 13 < 16 * bn) /* (an*7*LIMIT_numerator>1) < (LIMIT_numerator/7*9) * (bn>>1)) + { p = 9; q = 7; } + else if (an * 10 < 33 * (bn>>1)) /* (an*3*LIMIT_numerator= p * bn ? 
(an - 1) / (size_t) p : (bn - 1) / (size_t) q); + p--; q--; + + s = an - p * n; + t = bn - q * n; + + if(half) { /* Recover from badly chosen splitting */ + if (UNLIKELY (s<1)) {p--; s+=n; half=0;} + else if (UNLIKELY (t<1)) {q--; t+=n; half=0;} + } + } +#undef LIMIT_numerator +#undef LIMIT_denominat + + ASSERT (0 < s && s <= n); + ASSERT (0 < t && t <= n); + ASSERT (half || s + t > 3); + ASSERT (n > 2); + +#define r6 (pp + 3 * n) /* 3n+1 */ +#define r4 (pp + 7 * n) /* 3n+1 */ +#define r2 (pp +11 * n) /* 3n+1 */ +#define r0 (pp +15 * n) /* s+t <= 2*n */ +#define r7 (scratch) /* 3n+1 */ +#define r5 (scratch + 3 * n + 1) /* 3n+1 */ +#define r3 (scratch + 6 * n + 2) /* 3n+1 */ +#define r1 (scratch + 9 * n + 3) /* 3n+1 */ +#define v0 (pp +11 * n) /* n+1 */ +#define v1 (pp +12 * n+1) /* n+1 */ +#define v2 (pp +13 * n+2) /* n+1 */ +#define v3 (scratch +12 * n + 4) /* n+1 */ +#define wsi (scratch +12 * n + 4) /* 3n+1 */ +#define wse (scratch +13 * n + 5) /* 2n+1 */ + + /* Alloc also 3n+1 limbs for wsi... toom_interpolate_16pts may + need all of them */ +/* if (scratch == NULL) */ +/* scratch = TMP_SALLOC_LIMBS(mpn_toom8_sqr_itch(n * 8)); */ + ASSERT (15 * n + 6 <= mpn_toom8h_mul_itch (an, bn)); + ASSERT (15 * n + 6 <= mpn_toom8_sqr_itch (n * 8)); + + /********************** evaluation and recursive calls *********************/ + + /* $\pm1/8$ */ + sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 3, pp) ^ + mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 3, pp); + /* A(-1/8)*B(-1/8)*8^. */ /* A(+1/8)*B(+1/8)*8^. */ + TOOM8H_MUL_N_REC(pp, v0, v1, 2, r7, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r7, 2 * n + 1 + BIT_CORRECTION, pp, sign, n, 3*(1+half), 3*(half)); + + /* $\pm1/4$ */ + sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 2, pp) ^ + mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 2, pp); + /* A(-1/4)*B(-1/4)*4^. */ /* A(+1/4)*B(+1/4)*4^. */ + TOOM8H_MUL_N_REC(pp, v0, v1, 2, r5, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r5, 2 * n + 1, pp, sign, n, 2*(1+half), 2*(half)); + + /* $\pm2$ */ + sign = mpn_toom_eval_pm2 (v2, v0, p, ap, n, s, pp) ^ + mpn_toom_eval_pm2 (v3, v1, q, bp, n, t, pp); + /* A(-2)*B(-2) */ /* A(+2)*B(+2) */ + TOOM8H_MUL_N_REC(pp, v0, v1, 2, r3, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r3, 2 * n + 1, pp, sign, n, 1, 2); + + /* $\pm8$ */ + sign = mpn_toom_eval_pm2exp (v2, v0, p, ap, n, s, 3, pp) ^ + mpn_toom_eval_pm2exp (v3, v1, q, bp, n, t, 3, pp); + /* A(-8)*B(-8) */ /* A(+8)*B(+8) */ + TOOM8H_MUL_N_REC(pp, v0, v1, 2, r1, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r1, 2 * n + 1 + BIT_CORRECTION, pp, sign, n, 3, 6); + + /* $\pm1/2$ */ + sign = mpn_toom_eval_pm2rexp (v2, v0, p, ap, n, s, 1, pp) ^ + mpn_toom_eval_pm2rexp (v3, v1, q, bp, n, t, 1, pp); + /* A(-1/2)*B(-1/2)*2^. */ /* A(+1/2)*B(+1/2)*2^. 
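
In the multiplication each mpn_toom_eval_* call returns the sign of the value
at the negative evaluation point, and the two flags are combined with xor:
A(-t)*B(-t) is negative exactly when the operands' signs differ.  A toy
illustration with plain integers (the values stand in for A(-t) and B(-t) and
are arbitrary, not GMP code):

#include <assert.h>
#include <stdint.h>

int main (void)
{
  int64_t am = -21, bm = 13;        /* illustrative values of A(-t) and B(-t) */
  int sa = am < 0 ? ~0 : 0;         /* sign flags, as the evaluation helpers return them */
  int sb = bm < 0 ? ~0 : 0;
  int64_t ma = am < 0 ? -am : am;   /* magnitudes, as held in the limb vectors */
  int64_t mb = bm < 0 ? -bm : bm;
  int sign = sa ^ sb;               /* negative iff exactly one factor is negative */
  assert ((sign ? -(ma * mb) : ma * mb) == am * bm);
  return 0;
}
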
*/ + TOOM8H_MUL_N_REC(pp, v0, v1, 2, r6, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r6, 2 * n + 1, pp, sign, n, 1+half, half); + + /* $\pm1$ */ + sign = mpn_toom_eval_pm1 (v2, v0, p, ap, n, s, pp); + if (GMP_NUMB_BITS > 12*3 && UNLIKELY (q == 3)) + sign ^= mpn_toom_eval_dgr3_pm1 (v3, v1, bp, n, t, pp); + else + sign ^= mpn_toom_eval_pm1 (v3, v1, q, bp, n, t, pp); + /* A(-1)*B(-1) */ /* A(1)*B(1) */ + TOOM8H_MUL_N_REC(pp, v0, v1, 2, r4, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r4, 2 * n + 1, pp, sign, n, 0, 0); + + /* $\pm4$ */ + sign = mpn_toom_eval_pm2exp (v2, v0, p, ap, n, s, 2, pp) ^ + mpn_toom_eval_pm2exp (v3, v1, q, bp, n, t, 2, pp); + /* A(-4)*B(-4) */ /* A(+4)*B(+4) */ + TOOM8H_MUL_N_REC(pp, v0, v1, 2, r2, v2, v3, n + 1, wse); + mpn_toom_couple_handling (r2, 2 * n + 1, pp, sign, n, 2, 4); + +#undef v0 +#undef v1 +#undef v2 +#undef v3 +#undef wse + + /* A(0)*B(0) */ + TOOM8H_MUL_N_REC(pp, ap, bp, 0, pp, ap, bp, n, wsi); + + /* Infinity */ + if (UNLIKELY (half != 0)) { + if (s > t) { + TOOM8H_MUL_REC(r0, ap + p * n, s, bp + q * n, t, wsi); + } else { + TOOM8H_MUL_REC(r0, bp + q * n, t, ap + p * n, s, wsi); + }; + }; + + mpn_toom_interpolate_16pts (pp, r1, r3, r5, r7, n, s+t, half, wsi); + +#undef r0 +#undef r1 +#undef r2 +#undef r3 +#undef r4 +#undef r5 +#undef r6 +#undef wsi +} + +#undef TOOM8H_MUL_N_REC +#undef TOOM8H_MUL_REC +#undef MAYBE_mul_basecase +#undef MAYBE_mul_toom22 +#undef MAYBE_mul_toom33 +#undef MAYBE_mul_toom44 +#undef MAYBE_mul_toom8h diff --git a/gmp-6.3.0/mpn/generic/toom_couple_handling.c b/gmp-6.3.0/mpn/generic/toom_couple_handling.c new file mode 100644 index 0000000..cd253f7 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_couple_handling.c @@ -0,0 +1,80 @@ +/* Helper function for high degree Toom-Cook algorithms. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Gets {pp,n} and (sign?-1:1)*{np,n}. 
Computes at once: + {pp,n} <- ({pp,n}+{np,n})/2^{ps+1} + {pn,n} <- ({pp,n}-{np,n})/2^{ns+1} + Finally recompose them obtaining: + {pp,n+off} <- {pp,n}+{np,n}*2^{off*GMP_NUMB_BITS} +*/ +void +mpn_toom_couple_handling (mp_ptr pp, mp_size_t n, mp_ptr np, + int nsign, mp_size_t off, int ps, int ns) +{ + if (nsign) { +#ifdef HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (np, pp, np, n); +#else + mpn_sub_n (np, pp, np, n); + mpn_rshift (np, np, n, 1); +#endif + } else { +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (np, pp, np, n); +#else + mpn_add_n (np, pp, np, n); + mpn_rshift (np, np, n, 1); +#endif + } + +#ifdef HAVE_NATIVE_mpn_rsh1sub_n + if (ps == 1) + mpn_rsh1sub_n (pp, pp, np, n); + else +#endif + { + mpn_sub_n (pp, pp, np, n); + if (ps > 0) + mpn_rshift (pp, pp, n, ps); + } + if (ns > 0) + mpn_rshift (np, np, n, ns); + pp[n] = mpn_add_n (pp+off, pp+off, np, n-off); + ASSERT_NOCARRY (mpn_add_1(pp+n, np+n-off, off, pp[n]) ); +} diff --git a/gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm1.c b/gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm1.c new file mode 100644 index 0000000..5f491b6 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm1.c @@ -0,0 +1,72 @@ +/* mpn_toom_eval_dgr3_pm1 -- Evaluate a degree 3 polynomial in +1 and -1 + + Contributed to the GNU project by Niels Möller + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +int +mpn_toom_eval_dgr3_pm1 (mp_ptr xp1, mp_ptr xm1, + mp_srcptr xp, mp_size_t n, mp_size_t x3n, mp_ptr tp) +{ + int neg; + + ASSERT (x3n > 0); + ASSERT (x3n <= n); + + xp1[n] = mpn_add_n (xp1, xp, xp + 2*n, n); + tp[n] = mpn_add (tp, xp + n, n, xp + 3*n, x3n); + + neg = (mpn_cmp (xp1, tp, n + 1) < 0) ? 
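/* Here xp1 = x0 + x2 and tp = x1 + x3, so x(+1) = xp1 + tp and
   x(-1) = xp1 - tp.  Limb vectors are unsigned, so the branches below
   keep |xp1 - tp| in xm1, and neg = ~0 records that x(-1) is negative
   (the case xp1 < tp). */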
~0 : 0; + +#if HAVE_NATIVE_mpn_add_n_sub_n + if (neg) + mpn_add_n_sub_n (xp1, xm1, tp, xp1, n + 1); + else + mpn_add_n_sub_n (xp1, xm1, xp1, tp, n + 1); +#else + if (neg) + mpn_sub_n (xm1, tp, xp1, n + 1); + else + mpn_sub_n (xm1, xp1, tp, n + 1); + + mpn_add_n (xp1, xp1, tp, n + 1); +#endif + + ASSERT (xp1[n] <= 3); + ASSERT (xm1[n] <= 1); + + return neg; +} diff --git a/gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm2.c b/gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm2.c new file mode 100644 index 0000000..55e6b89 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_eval_dgr3_pm2.c @@ -0,0 +1,97 @@ +/* mpn_toom_eval_dgr3_pm2 -- Evaluate a degree 3 polynomial in +2 and -2 + + Contributed to the GNU project by Niels Möller + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Needs n+1 limbs of temporary storage. */ +int +mpn_toom_eval_dgr3_pm2 (mp_ptr xp2, mp_ptr xm2, + mp_srcptr xp, mp_size_t n, mp_size_t x3n, mp_ptr tp) +{ + mp_limb_t cy; + int neg; + + ASSERT (x3n > 0); + ASSERT (x3n <= n); + + /* (x0 + 4 * x2) +/- (2 x1 + 8 x_3) */ +#if HAVE_NATIVE_mpn_addlsh_n || HAVE_NATIVE_mpn_addlsh2_n +#if HAVE_NATIVE_mpn_addlsh2_n + xp2[n] = mpn_addlsh2_n (xp2, xp, xp + 2*n, n); + + cy = mpn_addlsh2_n (tp, xp + n, xp + 3*n, x3n); +#else /* HAVE_NATIVE_mpn_addlsh_n */ + xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2); + + cy = mpn_addlsh_n (tp, xp + n, xp + 3*n, x3n, 2); +#endif + if (x3n < n) + cy = mpn_add_1 (tp + x3n, xp + n + x3n, n - x3n, cy); + tp[n] = cy; +#else + cy = mpn_lshift (tp, xp + 2*n, n, 2); + xp2[n] = cy + mpn_add_n (xp2, tp, xp, n); + + tp[x3n] = mpn_lshift (tp, xp + 3*n, x3n, 2); + if (x3n < n) + tp[n] = mpn_add (tp, xp + n, n, tp, x3n + 1); + else + tp[n] += mpn_add_n (tp, xp + n, tp, n); +#endif + mpn_lshift (tp, tp, n+1, 1); + + neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? 
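/* At this point xp2 = x0 + 4*x2 and tp = 2*x1 + 8*x3, the even and odd
   parts of x(2).  x(+2) is their sum and x(-2) their difference; neg = ~0
   flags the case xp2 < tp, where x(-2) is negative and xm2 below is
   computed as tp - xp2. */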
~0 : 0; + +#if HAVE_NATIVE_mpn_add_n_sub_n + if (neg) + mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1); + else + mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1); +#else + if (neg) + mpn_sub_n (xm2, tp, xp2, n + 1); + else + mpn_sub_n (xm2, xp2, tp, n + 1); + + mpn_add_n (xp2, xp2, tp, n + 1); +#endif + + ASSERT (xp2[n] < 15); + ASSERT (xm2[n] < 10); + + return neg; +} diff --git a/gmp-6.3.0/mpn/generic/toom_eval_pm1.c b/gmp-6.3.0/mpn/generic/toom_eval_pm1.c new file mode 100644 index 0000000..a8cfa93 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_eval_pm1.c @@ -0,0 +1,89 @@ +/* mpn_toom_eval_pm1 -- Evaluate a polynomial in +1 and -1 + + Contributed to the GNU project by Niels Möller + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluates a polynomial of degree k > 3, in the points +1 and -1. */ +int +mpn_toom_eval_pm1 (mp_ptr xp1, mp_ptr xm1, unsigned k, + mp_srcptr xp, mp_size_t n, mp_size_t hn, mp_ptr tp) +{ + unsigned i; + int neg; + + ASSERT (k >= 4); + + ASSERT (hn > 0); + ASSERT (hn <= n); + + /* The degree k is also the number of full-size coefficients, so + * that last coefficient, of size hn, starts at xp + k*n. */ + + xp1[n] = mpn_add_n (xp1, xp, xp + 2*n, n); + for (i = 4; i < k; i += 2) + ASSERT_NOCARRY (mpn_add (xp1, xp1, n+1, xp+i*n, n)); + + tp[n] = mpn_add_n (tp, xp + n, xp + 3*n, n); + for (i = 5; i < k; i += 2) + ASSERT_NOCARRY (mpn_add (tp, tp, n+1, xp+i*n, n)); + + if (k & 1) + ASSERT_NOCARRY (mpn_add (tp, tp, n+1, xp+k*n, hn)); + else + ASSERT_NOCARRY (mpn_add (xp1, xp1, n+1, xp+k*n, hn)); + + neg = (mpn_cmp (xp1, tp, n + 1) < 0) ? 
~0 : 0; + +#if HAVE_NATIVE_mpn_add_n_sub_n + if (neg) + mpn_add_n_sub_n (xp1, xm1, tp, xp1, n + 1); + else + mpn_add_n_sub_n (xp1, xm1, xp1, tp, n + 1); +#else + if (neg) + mpn_sub_n (xm1, tp, xp1, n + 1); + else + mpn_sub_n (xm1, xp1, tp, n + 1); + + mpn_add_n (xp1, xp1, tp, n + 1); +#endif + + ASSERT (xp1[n] <= k); + ASSERT (xm1[n] <= k/2 + 1); + + return neg; +} diff --git a/gmp-6.3.0/mpn/generic/toom_eval_pm2.c b/gmp-6.3.0/mpn/generic/toom_eval_pm2.c new file mode 100644 index 0000000..be682c7 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_eval_pm2.c @@ -0,0 +1,130 @@ +/* mpn_toom_eval_pm2 -- Evaluate a polynomial in +2 and -2 + + Contributed to the GNU project by Niels Möller and Marco Bodrato + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +/* DO_addlsh2(d,a,b,n,cy) computes cy,{d,n} <- {a,n} + 4*(cy,{b,n}), it + can be used as DO_addlsh2(d,a,d,n,d[n]), for accumulation on {d,n+1}. */ +#if HAVE_NATIVE_mpn_addlsh2_n +#define DO_addlsh2(d, a, b, n, cy) \ +do { \ + (cy) <<= 2; \ + (cy) += mpn_addlsh2_n(d, a, b, n); \ +} while (0) +#else +#if HAVE_NATIVE_mpn_addlsh_n +#define DO_addlsh2(d, a, b, n, cy) \ +do { \ + (cy) <<= 2; \ + (cy) += mpn_addlsh_n(d, a, b, n, 2); \ +} while (0) +#else +/* The following is not a general substitute for addlsh2. + It is correct if d == b, but it is not if d == a. */ +#define DO_addlsh2(d, a, b, n, cy) \ +do { \ + (cy) <<= 2; \ + (cy) += mpn_lshift(d, b, n, 2); \ + (cy) += mpn_add_n(d, d, a, n); \ +} while (0) +#endif +#endif + +/* Evaluates a polynomial of degree 2 < k < GMP_NUMB_BITS, in the + points +2 and -2. */ +int +mpn_toom_eval_pm2 (mp_ptr xp2, mp_ptr xm2, unsigned k, + mp_srcptr xp, mp_size_t n, mp_size_t hn, mp_ptr tp) +{ + int i; + int neg; + mp_limb_t cy; + + ASSERT (k >= 3); + ASSERT (k < GMP_NUMB_BITS); + + ASSERT (hn > 0); + ASSERT (hn <= n); + + /* The degree k is also the number of full-size coefficients, so + * that last coefficient, of size hn, starts at xp + k*n. 
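 *
 * Below, two Horner chains with ratio 4 = 2^2 collect the coefficients of
 * each parity, so xp2 and tp end up holding the even- and odd-degree parts
 * of |x(2)| (which is which depends on the parity of k; the single final
 * shift by one bit supplies the missing factor of 2).  x(+2) is their sum,
 * x(-2) their difference, and the neg value returned after the parity
 * fix-up at the end of the function tells whether x(-2) is negative.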
*/ + + cy = 0; + DO_addlsh2 (xp2, xp + (k-2) * n, xp + k * n, hn, cy); + if (hn != n) + cy = mpn_add_1 (xp2 + hn, xp + (k-2) * n + hn, n - hn, cy); + for (i = k - 4; i >= 0; i -= 2) + DO_addlsh2 (xp2, xp + i * n, xp2, n, cy); + xp2[n] = cy; + + k--; + + cy = 0; + DO_addlsh2 (tp, xp + (k-2) * n, xp + k * n, n, cy); + for (i = k - 4; i >= 0; i -= 2) + DO_addlsh2 (tp, xp + i * n, tp, n, cy); + tp[n] = cy; + + if (k & 1) + ASSERT_NOCARRY(mpn_lshift (tp , tp , n + 1, 1)); + else + ASSERT_NOCARRY(mpn_lshift (xp2, xp2, n + 1, 1)); + + neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0; + +#if HAVE_NATIVE_mpn_add_n_sub_n + if (neg) + mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1); + else + mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1); +#else /* !HAVE_NATIVE_mpn_add_n_sub_n */ + if (neg) + mpn_sub_n (xm2, tp, xp2, n + 1); + else + mpn_sub_n (xm2, xp2, tp, n + 1); + + mpn_add_n (xp2, xp2, tp, n + 1); +#endif /* !HAVE_NATIVE_mpn_add_n_sub_n */ + + ASSERT (xp2[n] < (1<<(k+2))-1); + ASSERT (xm2[n] < ((1<<(k+3))-1 - (1^k&1))/3); + + neg ^= ((k & 1) - 1); + + return neg; +} + +#undef DO_addlsh2 diff --git a/gmp-6.3.0/mpn/generic/toom_eval_pm2exp.c b/gmp-6.3.0/mpn/generic/toom_eval_pm2exp.c new file mode 100644 index 0000000..c3c4651 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_eval_pm2exp.c @@ -0,0 +1,127 @@ +/* mpn_toom_eval_pm2exp -- Evaluate a polynomial in +2^k and -2^k + + Contributed to the GNU project by Niels Möller + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + +/* Evaluates a polynomial of degree k > 2, in the points +2^shift and -2^shift. */ +int +mpn_toom_eval_pm2exp (mp_ptr xp2, mp_ptr xm2, unsigned k, + mp_srcptr xp, mp_size_t n, mp_size_t hn, unsigned shift, + mp_ptr tp) +{ + unsigned i; + int neg; +#if HAVE_NATIVE_mpn_addlsh_n + mp_limb_t cy; +#endif + + ASSERT (k >= 3); + ASSERT (shift*k < GMP_NUMB_BITS); + + ASSERT (hn > 0); + ASSERT (hn <= n); + + /* The degree k is also the number of full-size coefficients, so + * that last coefficient, of size hn, starts at xp + k*n. 
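 *
 * Evaluating at +-2^shift multiplies coefficient x_i by 2^(i*shift); below
 * this is done with mpn_addlsh_n where available, or an explicit mpn_lshift
 * plus mpn_add_n otherwise.  The ASSERT (shift*k < GMP_NUMB_BITS) above
 * guarantees that even the largest shift count used, k*shift, stays below
 * the limb width, as those primitives require.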
*/ + +#if HAVE_NATIVE_mpn_addlsh_n + xp2[n] = mpn_addlsh_n (xp2, xp, xp + 2*n, n, 2*shift); + for (i = 4; i < k; i += 2) + xp2[n] += mpn_addlsh_n (xp2, xp2, xp + i*n, n, i*shift); + + tp[n] = mpn_lshift (tp, xp+n, n, shift); + for (i = 3; i < k; i+= 2) + tp[n] += mpn_addlsh_n (tp, tp, xp+i*n, n, i*shift); + + if (k & 1) + { + cy = mpn_addlsh_n (tp, tp, xp+k*n, hn, k*shift); + MPN_INCR_U (tp + hn, n+1 - hn, cy); + } + else + { + cy = mpn_addlsh_n (xp2, xp2, xp+k*n, hn, k*shift); + MPN_INCR_U (xp2 + hn, n+1 - hn, cy); + } + +#else /* !HAVE_NATIVE_mpn_addlsh_n */ + xp2[n] = mpn_lshift (tp, xp+2*n, n, 2*shift); + xp2[n] += mpn_add_n (xp2, xp, tp, n); + for (i = 4; i < k; i += 2) + { + xp2[n] += mpn_lshift (tp, xp + i*n, n, i*shift); + xp2[n] += mpn_add_n (xp2, xp2, tp, n); + } + + tp[n] = mpn_lshift (tp, xp+n, n, shift); + for (i = 3; i < k; i+= 2) + { + tp[n] += mpn_lshift (xm2, xp + i*n, n, i*shift); + tp[n] += mpn_add_n (tp, tp, xm2, n); + } + + xm2[hn] = mpn_lshift (xm2, xp + k*n, hn, k*shift); + if (k & 1) + mpn_add (tp, tp, n+1, xm2, hn+1); + else + mpn_add (xp2, xp2, n+1, xm2, hn+1); +#endif /* !HAVE_NATIVE_mpn_addlsh_n */ + + neg = (mpn_cmp (xp2, tp, n + 1) < 0) ? ~0 : 0; + +#if HAVE_NATIVE_mpn_add_n_sub_n + if (neg) + mpn_add_n_sub_n (xp2, xm2, tp, xp2, n + 1); + else + mpn_add_n_sub_n (xp2, xm2, xp2, tp, n + 1); +#else /* !HAVE_NATIVE_mpn_add_n_sub_n */ + if (neg) + mpn_sub_n (xm2, tp, xp2, n + 1); + else + mpn_sub_n (xm2, xp2, tp, n + 1); + + mpn_add_n (xp2, xp2, tp, n + 1); +#endif /* !HAVE_NATIVE_mpn_add_n_sub_n */ + + /* FIXME: the following asserts are useless if (k+1)*shift >= GMP_LIMB_BITS */ + ASSERT ((k+1)*shift >= GMP_LIMB_BITS || + xp2[n] < ((CNST_LIMB(1)<<((k+1)*shift))-1)/((CNST_LIMB(1)<= GMP_LIMB_BITS || + xm2[n] < ((CNST_LIMB(1)<<((k+2)*shift))-((k&1)?(CNST_LIMB(1)<= 3. 
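
mpn_toom_eval_pm2rexp below evaluates at the reciprocal points +-1/2^shift,
pre-scaled by a power of two (2^(shift*degree)) so that all values stay
integral; that is the same as evaluating the coefficient-reversed polynomial at
+-2^shift, and it is why the comments in the callers above carry a trailing
power-of-two marker such as "A(+1/2)*B(+1/2)*2^.".  A toy check of the identity
(coefficients are illustrative, not GMP code):

#include <assert.h>
#include <stdint.h>

int main (void)
{
  /* f(x) = c0 + c1*x + c2*x^2: the integer 4^2 * f(1/4) equals the
     reversed polynomial c2 + c1*x + c0*x^2 evaluated at x = 4. */
  int64_t c0 = 3, c1 = 5, c2 = 7;
  int64_t scaled   = c0 * 16 + c1 * 4 + c2;   /* 4^2 * f(1/4), kept integral */
  int64_t reversed = (c0 * 4 + c1) * 4 + c2;  /* Horner on reversed coefficients at 4 */
  assert (scaled == reversed);
  return 0;
}
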
*/ +int +mpn_toom_eval_pm2rexp (mp_ptr rp, mp_ptr rm, + unsigned int q, mp_srcptr ap, mp_size_t n, mp_size_t t, + unsigned int s, mp_ptr ws) +{ + unsigned int i; + int neg; + /* {ap,q*n+t} -> {rp,n+1} {rm,n+1} , with {ws, n+1}*/ + ASSERT (n >= t); + ASSERT (s != 0); /* or _eval_pm1 should be used */ + ASSERT (q > 1); + ASSERT (s*q < GMP_NUMB_BITS); + rp[n] = mpn_lshift(rp, ap, n, s*q); + ws[n] = mpn_lshift(ws, ap+n, n, s*(q-1)); + if( (q & 1) != 0) { + ASSERT_NOCARRY(mpn_add(ws,ws,n+1,ap+n*q,t)); + rp[n] += DO_mpn_addlsh_n(rp, ap+n*(q-1), n, s, rm); + } else { + ASSERT_NOCARRY(mpn_add(rp,rp,n+1,ap+n*q,t)); + } + for(i=2; i> s); \ + __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \ + MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \ +} while (0) +#endif + + +#define BINVERT_9 \ + ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39) + +#define BINVERT_255 \ + (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8))) + + /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */ +#if GMP_LIMB_BITS == 32 +#define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B)) +#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35)) +#else +#if GMP_LIMB_BITS == 64 +#define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B)) +#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0xE7B40D449F314C35)) +#endif +#endif + +#ifndef mpn_divexact_by255 +#if GMP_NUMB_BITS % 8 == 0 +#define mpn_divexact_by255(dst,src,size) \ + (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255))) +#else +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0) +#else +#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)) +#endif +#endif +#endif + +#ifndef mpn_divexact_by9x4 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2) +#else +#define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2) +#endif +#endif + +#ifndef mpn_divexact_by42525 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525) +#define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0) +#else +#define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)) +#endif +#endif + +#ifndef mpn_divexact_by2835x4 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835) +#define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2) +#else +#define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2) +#endif +#endif + +/* Interpolation for Toom-6.5 (or Toom-6), using the evaluation + points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely, + we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of + degree 11 (or 10), given the 12 (rsp. 11) values: + + r0 = limit at infinity of f(x) / x^11, + r1 = f(4),f(-4), + r2 = f(2),f(-2), + r3 = f(1),f(-1), + r4 = f(1/4),f(-1/4), + r5 = f(1/2),f(-1/2), + r6 = f(0). + + All couples of the form f(n),f(-n) must be already mixed with + toom_couple_handling(f(n),...,f(-n),...) + + The result is stored in {pp, spt + 7*n (or 6*n)}. + At entry, r6 is stored at {pp, 2n}, + r4 is stored at {pp + 3n, 3n + 1}. + r2 is stored at {pp + 7n, 3n + 1}. + r0 is stored at {pp +11n, spt}. 
+ + The other values are 3n+1 limbs each (with most significant limbs small). + + Negative intermediate results are stored two-complemented. + Inputs are destroyed. +*/ + +void +mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, + mp_size_t n, mp_size_t spt, int half, mp_ptr wsi) +{ + mp_limb_t cy; + mp_size_t n3; + mp_size_t n3p1; + n3 = 3 * n; + n3p1 = n3 + 1; + +#define r4 (pp + n3) /* 3n+1 */ +#define r2 (pp + 7 * n) /* 3n+1 */ +#define r0 (pp +11 * n) /* s+t <= 2*n */ + + /******************************* interpolation *****************************/ + if (half != 0) { + cy = mpn_sub_n (r3, r3, r0, spt); + MPN_DECR_U (r3 + spt, n3p1 - spt, cy); + + cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi); + MPN_DECR_U (r2 + spt, n3p1 - spt, cy); + DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi); + + cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi); + MPN_DECR_U (r1 + spt, n3p1 - spt, cy); + DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi); + }; + + r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi); + DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi); + +#if HAVE_NATIVE_mpn_add_n_sub_n + mpn_add_n_sub_n (r1, r4, r4, r1, n3p1); +#else + ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1)); + mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */ + MP_PTR_SWAP(r1, wsi); +#endif + + r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi); + DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi); + +#if HAVE_NATIVE_mpn_add_n_sub_n + mpn_add_n_sub_n (r2, r5, r5, r2, n3p1); +#else + mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */ + ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1)); + MP_PTR_SWAP(r5, wsi); +#endif + + r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n); + +#if AORSMUL_FASTER_AORS_AORSLSH + mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */ +#else + mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */ + DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */ +#endif + /* A division by 2835x4 follows. Warning: the operand can be negative! 
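
The mpn_divexact_by* macros defined above divide by a small odd constant by
multiplying with its inverse modulo 2^GMP_NUMB_BITS (the BINVERT_* constants);
this yields the true quotient only because the interpolation guarantees the
division is exact, and any extra power of two (the x4, x64 variants) is handled
separately by a shift or a scaled divisor.  A self-contained one-word sketch of
the trick (operand values are illustrative, not GMP code):

#include <assert.h>
#include <stdint.h>

/* inverse of an odd d modulo 2^64, by Newton/Hensel lifting */
static uint64_t binvert (uint64_t d)
{
  uint64_t inv = d;           /* d*d == 1 (mod 8), so inv is correct to 3 bits */
  int i;
  for (i = 0; i < 5; i++)     /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 correct bits */
    inv *= 2 - d * inv;
  return inv;
}

int main (void)
{
  uint64_t d = 2835, x = 1234567;
  uint64_t m = x * d;               /* a known multiple of d */
  assert (d * binvert (d) == 1);    /* inverse modulo 2^64 */
  assert (m * binvert (d) == x);    /* exact division done as a multiplication */
  return 0;
}
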
*/ + mpn_divexact_by2835x4(r4, r4, n3p1); + if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0) + r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2)); + +#if AORSMUL_FASTER_2AORSLSH + mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */ +#else + DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */ + DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */ +#endif + mpn_divexact_by255(r5, r5, n3p1); + + ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi)); + +#if AORSMUL_FASTER_3AORSLSH + ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100)); +#else + ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi)); + ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi)); + ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi)); +#endif + ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi)); + mpn_divexact_by42525(r1, r1, n3p1); + +#if AORSMUL_FASTER_AORS_2AORSLSH + ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225)); +#else + ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1)); + ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi)); + ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi)); +#endif + mpn_divexact_by9x4(r2, r2, n3p1); + + ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1)); + +#ifdef HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (r4, r2, r4, n3p1); + r4 [n3p1 - 1] &= GMP_NUMB_MASK >> 1; +#else + mpn_sub_n (r4, r2, r4, n3p1); + ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1)); +#endif + ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1)); + +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (r5, r5, r1, n3p1); + r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1; +#else + mpn_add_n (r5, r5, r1, n3p1); + ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1)); +#endif + + /* last interpolation steps... */ + ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1)); + ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1)); + /* ... 
could be mixed with recomposition + ||H-r5|M-r5|L-r5| ||H-r1|M-r1|L-r1| + */ + + /***************************** recomposition *******************************/ + /* + pp[] prior to operations: + |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp + + summation scheme for remaining operations: + |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp + |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp + ||H r1|M r1|L r1| ||H r3|M r3|L r3| ||H_r5|M_r5|L_r5| + */ + + cy = mpn_add_n (pp + n, pp + n, r5, n); + cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy); +#if HAVE_NATIVE_mpn_add_nc + cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy); +#else + MPN_INCR_U (r5 + 2 * n, n + 1, cy); + cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n); +#endif + MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy); + + pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n); + cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]); +#if HAVE_NATIVE_mpn_add_nc + cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy); +#else + MPN_INCR_U (r3 + 2 * n, n + 1, cy); + cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n); +#endif + MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy); + + pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n); + if (half) { + cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]); +#if HAVE_NATIVE_mpn_add_nc + if (LIKELY (spt > n)) { + cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy); + MPN_INCR_U (pp + 4 * n3, spt - n, cy); + } else { + ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy)); + } +#else + MPN_INCR_U (r1 + 2 * n, n + 1, cy); + if (LIKELY (spt > n)) { + cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n); + MPN_INCR_U (pp + 4 * n3, spt - n, cy); + } else { + ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt)); + } +#endif + } else { + ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n])); + } + +#undef r0 +#undef r2 +#undef r4 +} diff --git a/gmp-6.3.0/mpn/generic/toom_interpolate_16pts.c b/gmp-6.3.0/mpn/generic/toom_interpolate_16pts.c new file mode 100644 index 0000000..c1457be --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_interpolate_16pts.c @@ -0,0 +1,545 @@ +/* Interpolation for the algorithm Toom-Cook 8.5-way. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010, 2012, 2015, 2020 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + + +#include "gmp-impl.h" + + +#if GMP_NUMB_BITS < 29 +#error Not implemented: Both sublsh_n(,,,28) should be corrected; r2 and r5 need one more LIMB. +#endif + +#if GMP_NUMB_BITS < 28 +#error Not implemented: divexact_by188513325 and _by182712915 will not work. +#endif + + +/* FIXME: tuneup should decide the best variant */ +#ifndef AORSMUL_FASTER_AORS_AORSLSH +#define AORSMUL_FASTER_AORS_AORSLSH 1 +#endif +#ifndef AORSMUL_FASTER_AORS_2AORSLSH +#define AORSMUL_FASTER_AORS_2AORSLSH 1 +#endif +#ifndef AORSMUL_FASTER_2AORSLSH +#define AORSMUL_FASTER_2AORSLSH 1 +#endif +#ifndef AORSMUL_FASTER_3AORSLSH +#define AORSMUL_FASTER_3AORSLSH 1 +#endif + + +#if HAVE_NATIVE_mpn_sublsh_n +#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s) +#else +static mp_limb_t +DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws) +{ +#if USE_MUL_1 && 0 + return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s)); +#else + mp_limb_t __cy; + __cy = mpn_lshift(ws,src,n,s); + return __cy + mpn_sub_n(dst,dst,ws,n); +#endif +} +#endif + +#if HAVE_NATIVE_mpn_addlsh_n +#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s) +#else +#if !defined (AORSMUL_FASTER_2AORSLSH) && !defined (AORSMUL_FASTER_AORS_2AORSLSH) +static mp_limb_t +DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws) +{ +#if USE_MUL_1 && 0 + return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s)); +#else + mp_limb_t __cy; + __cy = mpn_lshift(ws,src,n,s); + return __cy + mpn_add_n(dst,dst,ws,n); +#endif +} +#endif +#endif + +#if HAVE_NATIVE_mpn_subrsh +#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s) +#else +/* FIXME: This is not a correct definition, it assumes no carry */ +#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \ +do { \ + mp_limb_t __cy; \ + MPN_DECR_U (dst, nd, src[0] >> s); \ + __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \ + MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \ +} while (0) +#endif + + +#if GMP_NUMB_BITS < 43 +#define BIT_CORRECTION 1 +#define CORRECTION_BITS GMP_NUMB_BITS +#else +#define BIT_CORRECTION 0 +#define CORRECTION_BITS 0 +#endif + +#define BINVERT_9 \ + ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39) + +#define BINVERT_255 \ + (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8))) + + /* FIXME: find some more general expressions for inverses */ +#if GMP_LIMB_BITS == 32 +#define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B)) +#define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35)) +#define BINVERT_182712915 (GMP_NUMB_MASK & CNST_LIMB(0x550659DB)) +#define BINVERT_188513325 (GMP_NUMB_MASK & CNST_LIMB(0xFBC333A5)) +#define BINVERT_255x182712915L (GMP_NUMB_MASK & CNST_LIMB(0x6FC4CB25)) +#define BINVERT_255x188513325L (GMP_NUMB_MASK & CNST_LIMB(0x6864275B)) +#if GMP_NAIL_BITS == 0 +#define BINVERT_255x182712915H CNST_LIMB(0x1B649A07) +#define BINVERT_255x188513325H CNST_LIMB(0x06DB993A) +#else /* GMP_NAIL_BITS != 0 */ +#define BINVERT_255x182712915H \ + (GMP_NUMB_MASK & CNST_LIMB((0x1B649A07<>GMP_NUMB_BITS))) +#define BINVERT_255x188513325H \ + (GMP_NUMB_MASK & CNST_LIMB((0x06DB993A<>GMP_NUMB_BITS))) +#endif +#else +#if GMP_LIMB_BITS == 64 +#define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B)) +#define BINVERT_42525 (GMP_NUMB_MASK & 
CNST_LIMB(0xE7B40D449F314C35)) +#define BINVERT_255x182712915 (GMP_NUMB_MASK & CNST_LIMB(0x1B649A076FC4CB25)) +#define BINVERT_255x188513325 (GMP_NUMB_MASK & CNST_LIMB(0x06DB993A6864275B)) +#endif +#endif + +#ifndef mpn_divexact_by255 +#if GMP_NUMB_BITS % 8 == 0 +#define mpn_divexact_by255(dst,src,size) \ + (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255))) +#else +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0) +#else +#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)) +#endif +#endif +#endif + +#ifndef mpn_divexact_by255x4 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by255x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,2) +#else +#define mpn_divexact_by255x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)<<2) +#endif +#endif + +#ifndef mpn_divexact_by9x16 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by9x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,4) +#else +#define mpn_divexact_by9x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<4) +#endif +#endif + +#ifndef mpn_divexact_by42525x16 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525) +#define mpn_divexact_by42525x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,4) +#else +#define mpn_divexact_by42525x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)<<4) +#endif +#endif + +#ifndef mpn_divexact_by2835x64 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835) +#define mpn_divexact_by2835x64(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,6) +#else +#define mpn_divexact_by2835x64(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<6) +#endif +#endif + +#ifndef mpn_divexact_by255x182712915 +#if GMP_NUMB_BITS < 36 +#if HAVE_NATIVE_mpn_bdiv_q_2_pi2 && defined(BINVERT_255x182712915H) +/* FIXME: use mpn_bdiv_q_2_pi2 */ +#endif +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_182712915) +#define mpn_divexact_by255x182712915(dst,src,size) \ + do { \ + mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(182712915),BINVERT_182712915,0); \ + mpn_divexact_by255(dst,dst,size); \ + } while(0) +#else +#define mpn_divexact_by255x182712915(dst,src,size) \ + do { \ + mpn_divexact_1(dst,src,size,CNST_LIMB(182712915)); \ + mpn_divexact_by255(dst,dst,size); \ + } while(0) +#endif +#else /* GMP_NUMB_BITS > 35 */ +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x182712915) +#define mpn_divexact_by255x182712915(dst,src,size) \ + mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(182712915),BINVERT_255x182712915,0) +#else +#define mpn_divexact_by255x182712915(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(182712915)) +#endif +#endif /* GMP_NUMB_BITS >?< 36 */ +#endif + +#ifndef mpn_divexact_by255x188513325 +#if GMP_NUMB_BITS < 36 +#if HAVE_NATIVE_mpn_bdiv_q_1_pi2 && defined(BINVERT_255x188513325H) +/* FIXME: use mpn_bdiv_q_1_pi2 */ +#endif +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_188513325) +#define mpn_divexact_by255x188513325(dst,src,size) \ + do { \ + mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(188513325),BINVERT_188513325,0); \ + mpn_divexact_by255(dst,dst,size); \ + } while(0) +#else +#define mpn_divexact_by255x188513325(dst,src,size) \ + do { \ + mpn_divexact_1(dst,src,size,CNST_LIMB(188513325)); \ + mpn_divexact_by255(dst,dst,size); \ + } while(0) +#endif +#else /* GMP_NUMB_BITS > 35 */ +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 
&& defined(BINVERT_255x188513325) +#define mpn_divexact_by255x188513325(dst,src,size) \ + mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(188513325),BINVERT_255x188513325,0) +#else +#define mpn_divexact_by255x188513325(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(188513325)) +#endif +#endif /* GMP_NUMB_BITS >?< 36 */ +#endif + +/* Interpolation for Toom-8.5 (or Toom-8), using the evaluation + points: infinity(8.5 only), +-8, +-4, +-2, +-1, +-1/4, +-1/2, + +-1/8, 0. More precisely, we want to compute + f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 15 (or + 14), given the 16 (rsp. 15) values: + + r0 = limit at infinity of f(x) / x^15, + r1 = f(8),f(-8), + r2 = f(4),f(-4), + r3 = f(2),f(-2), + r4 = f(1),f(-1), + r5 = f(1/4),f(-1/4), + r6 = f(1/2),f(-1/2), + r7 = f(1/8),f(-1/8), + r8 = f(0). + + All couples of the form f(n),f(-n) must be already mixed with + toom_couple_handling(f(n),...,f(-n),...) + + The result is stored in {pp, spt + 7*n (or 8*n)}. + At entry, r8 is stored at {pp, 2n}, + r6 is stored at {pp + 3n, 3n + 1}. + r4 is stored at {pp + 7n, 3n + 1}. + r2 is stored at {pp +11n, 3n + 1}. + r0 is stored at {pp +15n, spt}. + + The other values are 3n+1 limbs each (with most significant limbs small). + + Negative intermediate results are stored two-complemented. + Inputs are destroyed. +*/ + +void +mpn_toom_interpolate_16pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, mp_ptr r7, + mp_size_t n, mp_size_t spt, int half, mp_ptr wsi) +{ + mp_limb_t cy; + mp_size_t n3; + mp_size_t n3p1; + n3 = 3 * n; + n3p1 = n3 + 1; + +#define r6 (pp + n3) /* 3n+1 */ +#define r4 (pp + 7 * n) /* 3n+1 */ +#define r2 (pp +11 * n) /* 3n+1 */ +#define r0 (pp +15 * n) /* s+t <= 2*n */ + + ASSERT( spt <= 2 * n ); + /******************************* interpolation *****************************/ + if( half != 0) { + cy = mpn_sub_n (r4, r4, r0, spt); + MPN_DECR_U (r4 + spt, n3p1 - spt, cy); + + cy = DO_mpn_sublsh_n (r3, r0, spt, 14, wsi); + MPN_DECR_U (r3 + spt, n3p1 - spt, cy); + DO_mpn_subrsh(r6, n3p1, r0, spt, 2, wsi); + + cy = DO_mpn_sublsh_n (r2, r0, spt, 28, wsi); + MPN_DECR_U (r2 + spt, n3p1 - spt, cy); + DO_mpn_subrsh(r5, n3p1, r0, spt, 4, wsi); + + cy = DO_mpn_sublsh_n (r1 + BIT_CORRECTION, r0, spt, 42 - CORRECTION_BITS, wsi); +#if BIT_CORRECTION + cy = mpn_sub_1 (r1 + spt + BIT_CORRECTION, r1 + spt + BIT_CORRECTION, + n3p1 - spt - BIT_CORRECTION, cy); + ASSERT (BIT_CORRECTION > 0 || cy == 0); + /* FIXME: assumes r7[n3p1] is writable (it is if r5 follows). */ + cy = r7[n3p1]; + r7[n3p1] = 0x80; +#else + MPN_DECR_U (r1 + spt + BIT_CORRECTION, n3p1 - spt - BIT_CORRECTION, cy); +#endif + DO_mpn_subrsh(r7, n3p1 + BIT_CORRECTION, r0, spt, 6, wsi); +#if BIT_CORRECTION + /* FIXME: assumes r7[n3p1] is writable. 
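+     (it is if r5 follows, as the matching FIXME above already notes)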
*/ + ASSERT ( BIT_CORRECTION > 0 || r7[n3p1] == 0x80 ); + r7[n3p1] = cy; +#endif + }; + + r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 28, wsi); + DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 4, wsi); + +#if HAVE_NATIVE_mpn_add_n_sub_n + mpn_add_n_sub_n (r2, r5, r5, r2, n3p1); +#else + mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */ + ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1)); + MP_PTR_SWAP(r5, wsi); +#endif + + r6[n3] -= DO_mpn_sublsh_n (r6 + n, pp, 2 * n, 14, wsi); + DO_mpn_subrsh(r3 + n, 2 * n + 1, pp, 2 * n, 2, wsi); + +#if HAVE_NATIVE_mpn_add_n_sub_n + mpn_add_n_sub_n (r3, r6, r6, r3, n3p1); +#else + ASSERT_NOCARRY(mpn_add_n (wsi, r3, r6, n3p1)); + mpn_sub_n (r6, r6, r3, n3p1); /* can be negative */ + MP_PTR_SWAP(r3, wsi); +#endif + + cy = DO_mpn_sublsh_n (r7 + n + BIT_CORRECTION, pp, 2 * n, 42 - CORRECTION_BITS, wsi); +#if BIT_CORRECTION + MPN_DECR_U (r1 + n, 2 * n + 1, pp[0] >> 6); + cy = DO_mpn_sublsh_n (r1 + n, pp + 1, 2 * n - 1, GMP_NUMB_BITS - 6, wsi); + cy = mpn_sub_1(r1 + 3 * n - 1, r1 + 3 * n - 1, 2, cy); + ASSERT ( BIT_CORRECTION > 0 || cy != 0 ); +#else + r7[n3] -= cy; + DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 6, wsi); +#endif + +#if HAVE_NATIVE_mpn_add_n_sub_n + mpn_add_n_sub_n (r1, r7, r7, r1, n3p1); +#else + mpn_sub_n (wsi, r7, r1, n3p1); /* can be negative */ + mpn_add_n (r1, r1, r7, n3p1); /* if BIT_CORRECTION != 0, can give a carry. */ + MP_PTR_SWAP(r7, wsi); +#endif + + r4[n3] -= mpn_sub_n (r4+n, r4+n, pp, 2 * n); + +#if AORSMUL_FASTER_2AORSLSH + mpn_submul_1 (r5, r6, n3p1, 1028); /* can be negative */ +#else + DO_mpn_sublsh_n (r5, r6, n3p1, 2, wsi); /* can be negative */ + DO_mpn_sublsh_n (r5, r6, n3p1,10, wsi); /* can be negative */ +#endif + + mpn_submul_1 (r7, r5, n3p1, 1300); /* can be negative */ +#if AORSMUL_FASTER_3AORSLSH + mpn_submul_1 (r7, r6, n3p1, 1052688); /* can be negative */ +#else + DO_mpn_sublsh_n (r7, r6, n3p1, 4, wsi); /* can be negative */ + DO_mpn_sublsh_n (r7, r6, n3p1,12, wsi); /* can be negative */ + DO_mpn_sublsh_n (r7, r6, n3p1,20, wsi); /* can be negative */ +#endif + mpn_divexact_by255x188513325(r7, r7, n3p1); + + mpn_submul_1 (r5, r7, n3p1, 12567555); /* can be negative */ + /* A division by 2835x64 follows. Warning: the operand can be negative! */ + mpn_divexact_by2835x64(r5, r5, n3p1); + if ((r5[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-7))) != 0) + r5[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-6)); + +#if AORSMUL_FASTER_AORS_AORSLSH + mpn_submul_1 (r6, r7, n3p1, 4095); /* can be negative */ +#else + mpn_add_n (r6, r6, r7, n3p1); /* can give a carry */ + DO_mpn_sublsh_n (r6, r7, n3p1, 12, wsi); /* can be negative */ +#endif +#if AORSMUL_FASTER_2AORSLSH + mpn_addmul_1 (r6, r5, n3p1, 240); /* can be negative */ +#else + DO_mpn_addlsh_n (r6, r5, n3p1, 8, wsi); /* can give a carry */ + DO_mpn_sublsh_n (r6, r5, n3p1, 4, wsi); /* can be negative */ +#endif + /* A division by 255x4 follows. Warning: the operand can be negative! 
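+     Dividing by the factor 4 is a right shift of that possibly negative
+     two's-complement value, so the top limb is patched just after the
+     division (the |= of GMP_NUMB_MAX << (GMP_NUMB_BITS-2)), effectively
+     sign-extending the result, as was done for the 2835x64 division above.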
*/ + mpn_divexact_by255x4(r6, r6, n3p1); + if ((r6[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0) + r6[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2)); + + ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r4, n3p1, 7, wsi)); + + ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r4, n3p1, 13, wsi)); + ASSERT_NOCARRY(mpn_submul_1 (r2, r3, n3p1, 400)); + + /* If GMP_NUMB_BITS < 42 next operations on r1 can give a carry!*/ + DO_mpn_sublsh_n (r1, r4, n3p1, 19, wsi); + mpn_submul_1 (r1, r2, n3p1, 1428); + mpn_submul_1 (r1, r3, n3p1, 112896); + mpn_divexact_by255x182712915(r1, r1, n3p1); + + ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 15181425)); + mpn_divexact_by42525x16(r2, r2, n3p1); + +#if AORSMUL_FASTER_AORS_2AORSLSH + ASSERT_NOCARRY(mpn_submul_1 (r3, r1, n3p1, 3969)); +#else + ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1)); + ASSERT_NOCARRY(DO_mpn_addlsh_n (r3, r1, n3p1, 7, wsi)); + ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r1, n3p1, 12, wsi)); +#endif + ASSERT_NOCARRY(mpn_submul_1 (r3, r2, n3p1, 900)); + mpn_divexact_by9x16(r3, r3, n3p1); + + ASSERT_NOCARRY(mpn_sub_n (r4, r4, r1, n3p1)); + ASSERT_NOCARRY(mpn_sub_n (r4, r4, r3, n3p1)); + ASSERT_NOCARRY(mpn_sub_n (r4, r4, r2, n3p1)); + +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (r6, r2, r6, n3p1); + r6 [n3p1 - 1] &= GMP_NUMB_MASK >> 1; +#else + mpn_add_n (r6, r2, r6, n3p1); + ASSERT_NOCARRY(mpn_rshift(r6, r6, n3p1, 1)); +#endif + ASSERT_NOCARRY(mpn_sub_n (r2, r2, r6, n3p1)); + +#ifdef HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (r5, r3, r5, n3p1); + r5 [n3p1 - 1] &= GMP_NUMB_MASK >> 1; +#else + mpn_sub_n (r5, r3, r5, n3p1); + ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1)); +#endif + ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, n3p1)); + +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (r7, r1, r7, n3p1); + r7 [n3p1 - 1] &= GMP_NUMB_MASK >> 1; +#else + mpn_add_n (r7, r1, r7, n3p1); + ASSERT_NOCARRY(mpn_rshift(r7, r7, n3p1, 1)); +#endif + ASSERT_NOCARRY(mpn_sub_n (r1, r1, r7, n3p1)); + + /* last interpolation steps... */ + /* ... 
could be mixed with recomposition + ||H-r7|M-r7|L-r7| ||H-r5|M-r5|L-r5| + */ + + /***************************** recomposition *******************************/ + /* + pp[] prior to operations: + |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp + + summation scheme for remaining operations: + |__16|n_15|n_14|n_13|n_12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp + |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp + ||H r1|M r1|L r1| ||H r3|M r3|L r3| ||H_r5|M_r5|L_r5| ||H r7|M r7|L r7| + */ + + cy = mpn_add_n (pp + n, pp + n, r7, n); + cy = mpn_add_1 (pp + 2 * n, r7 + n, n, cy); +#if HAVE_NATIVE_mpn_add_nc + cy = r7[n3] + mpn_add_nc(pp + n3, pp + n3, r7 + 2 * n, n, cy); +#else + MPN_INCR_U (r7 + 2 * n, n + 1, cy); + cy = r7[n3] + mpn_add_n (pp + n3, pp + n3, r7 + 2 * n, n); +#endif + MPN_INCR_U (pp + 4 * n, 2 * n + 1, cy); + + pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r5, n); + cy = mpn_add_1 (pp + 2 * n3, r5 + n, n, pp[2 * n3]); +#if HAVE_NATIVE_mpn_add_nc + cy = r5[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r5 + 2 * n, n, cy); +#else + MPN_INCR_U (r5 + 2 * n, n + 1, cy); + cy = r5[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r5 + 2 * n, n); +#endif + MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy); + + pp[10 * n]+= mpn_add_n (pp + 9 * n, pp + 9 * n, r3, n); + cy = mpn_add_1 (pp + 10 * n, r3 + n, n, pp[10 * n]); +#if HAVE_NATIVE_mpn_add_nc + cy = r3[n3] + mpn_add_nc(pp +11 * n, pp +11 * n, r3 + 2 * n, n, cy); +#else + MPN_INCR_U (r3 + 2 * n, n + 1, cy); + cy = r3[n3] + mpn_add_n (pp +11 * n, pp +11 * n, r3 + 2 * n, n); +#endif + MPN_INCR_U (pp +12 * n, 2 * n + 1, cy); + + pp[14 * n]+=mpn_add_n (pp +13 * n, pp +13 * n, r1, n); + if ( half ) { + cy = mpn_add_1 (pp + 14 * n, r1 + n, n, pp[14 * n]); +#if HAVE_NATIVE_mpn_add_nc + if(LIKELY(spt > n)) { + cy = r1[n3] + mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, n, cy); + MPN_INCR_U (pp + 16 * n, spt - n, cy); + } else { + ASSERT_NOCARRY(mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt, cy)); + } +#else + MPN_INCR_U (r1 + 2 * n, n + 1, cy); + if(LIKELY(spt > n)) { + cy = r1[n3] + mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, n); + MPN_INCR_U (pp + 16 * n, spt - n, cy); + } else { + ASSERT_NOCARRY(mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt)); + } +#endif + } else { + ASSERT_NOCARRY(mpn_add_1 (pp + 14 * n, r1 + n, spt, pp[14 * n])); + } + +#undef r0 +#undef r2 +#undef r4 +#undef r6 +} diff --git a/gmp-6.3.0/mpn/generic/toom_interpolate_5pts.c b/gmp-6.3.0/mpn/generic/toom_interpolate_5pts.c new file mode 100644 index 0000000..466ab85 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_interpolate_5pts.c @@ -0,0 +1,198 @@ +/* mpn_toom_interpolate_5pts -- Interpolate for toom3, 33, 42. + + Contributed to the GNU project by Robert Harley. + Improvements by Paul Zimmermann and Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2000-2003, 2005-2007, 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +void +mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1, + mp_size_t k, mp_size_t twor, int sa, + mp_limb_t vinf0) +{ + mp_limb_t cy, saved; + mp_size_t twok; + mp_size_t kk1; + mp_ptr c1, v1, c3, vinf; + + twok = k + k; + kk1 = twok + 1; + + c1 = c + k; + v1 = c1 + k; + c3 = v1 + k; + vinf = c3 + k; + +#define v0 (c) + /* (1) v2 <- v2-vm1 < v2+|vm1|, (16 8 4 2 1) - (1 -1 1 -1 1) = + thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k) (15 9 3 3 0) + */ + if (sa) + ASSERT_NOCARRY (mpn_add_n (v2, v2, vm1, kk1)); + else + ASSERT_NOCARRY (mpn_sub_n (v2, v2, vm1, kk1)); + + /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} + v0 v1 hi(vinf) |vm1| v2-vm1 EMPTY */ + + ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1)); /* v2 <- v2 / 3 */ + /* (5 3 1 1 0)*/ + + /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} + v0 v1 hi(vinf) |vm1| (v2-vm1)/3 EMPTY */ + + /* (2) vm1 <- tm1 := (v1 - vm1) / 2 [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 = + tm1 >= 0 (0 1 0 1 0) + No carry comes out from {v1, kk1} +/- {vm1, kk1}, + and the division by two is exact. + If (sa!=0) the sign of vm1 is negative */ + if (sa) + { +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (vm1, v1, vm1, kk1); +#else + ASSERT_NOCARRY (mpn_add_n (vm1, v1, vm1, kk1)); + ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1)); +#endif + } + else + { +#ifdef HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (vm1, v1, vm1, kk1); +#else + ASSERT_NOCARRY (mpn_sub_n (vm1, v1, vm1, kk1)); + ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1)); +#endif + } + + /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} + v0 v1 hi(vinf) tm1 (v2-vm1)/3 EMPTY */ + + /* (3) v1 <- t1 := v1 - v0 (1 1 1 1 1) - (0 0 0 0 1) = (1 1 1 1 0) + t1 >= 0 + */ + vinf[0] -= mpn_sub_n (v1, v1, c, twok); + + /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} + v0 v1-v0 hi(vinf) tm1 (v2-vm1)/3 EMPTY */ + + /* (4) v2 <- t2 := ((v2-vm1)/3-t1)/2 = (v2-vm1-3*t1)/6 + t2 >= 0 [(5 3 1 1 0) - (1 1 1 1 0)]/2 = (2 1 0 0 0) + */ +#ifdef HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (v2, v2, v1, kk1); +#else + ASSERT_NOCARRY (mpn_sub_n (v2, v2, v1, kk1)); + ASSERT_NOCARRY (mpn_rshift (v2, v2, kk1, 1)); +#endif + + /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r} + v0 v1-v0 hi(vinf) tm1 (v2-vm1-3t1)/6 EMPTY */ + + /* (5) v1 <- t1-tm1 (1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0) + result is v1 >= 0 + */ + ASSERT_NOCARRY (mpn_sub_n (v1, v1, vm1, kk1)); + + /* We do not need to read the value in vm1, so we add it in {c+k, ...} */ + cy = mpn_add_n (c1, c1, vm1, kk1); + MPN_INCR_U (c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */ + /* Memory allocated for vm1 is now free, it can be recycled ...*/ + + /* (6) v2 <- v2 - 2*vinf, (2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0) + result is v2 >= 0 */ + saved = vinf[0]; /* Remember v1's highest byte (will be overwritten). 
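+       vinf[0] is also the most significant limb of the {v1, kk1} area; its
+       current contents are saved here and restored further down, once vinf
+       has served as an operand, so that the later borrow and carry
+       propagation through v1's high limbs hits the right value.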
*/ + vinf[0] = vinf0; /* Set the right value for vinf0 */ +#ifdef HAVE_NATIVE_mpn_sublsh1_n_ip1 + cy = mpn_sublsh1_n_ip1 (v2, vinf, twor); +#else + /* Overwrite unused vm1 */ + cy = mpn_lshift (vm1, vinf, twor, 1); + cy += mpn_sub_n (v2, v2, vm1, twor); +#endif + MPN_DECR_U (v2 + twor, kk1 - twor, cy); + + /* Current matrix is + [1 0 0 0 0; vinf + 0 1 0 0 0; v2 + 1 0 1 0 0; v1 + 0 1 0 1 0; vm1 + 0 0 0 0 1] v0 + Some values already are in-place (we added vm1 in the correct position) + | vinf| v1 | v0 | + | vm1 | + One still is in a separated area + | +v2 | + We have to compute v1-=vinf; vm1 -= v2, + |-vinf| + | -v2 | + Carefully reordering operations we can avoid to compute twice the sum + of the high half of v2 plus the low half of vinf. + */ + + /* Add the high half of t2 in {vinf} */ + if ( LIKELY(twor > k + 1) ) { /* This is the expected flow */ + cy = mpn_add_n (vinf, vinf, v2 + k, k + 1); + MPN_INCR_U (c3 + kk1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */ + } else { /* triggered only by very unbalanced cases like + (k+k+(k-2))x(k+k+1) , should be handled by toom32 */ + ASSERT_NOCARRY (mpn_add_n (vinf, vinf, v2 + k, twor)); + } + /* (7) v1 <- v1 - vinf, (1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0) + result is >= 0 */ + /* Side effect: we also subtracted (high half) vm1 -= v2 */ + cy = mpn_sub_n (v1, v1, vinf, twor); /* vinf is at most twor long. */ + vinf0 = vinf[0]; /* Save again the right value for vinf0 */ + vinf[0] = saved; + MPN_DECR_U (v1 + twor, kk1 - twor, cy); /* Treat the last bytes. */ + + /* (8) vm1 <- vm1-v2 (0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0) + Operate only on the low half. + */ + cy = mpn_sub_n (c1, c1, v2, k); + MPN_DECR_U (v1, kk1, cy); + + /********************* Beginning the final phase **********************/ + + /* Most of the recomposition was done */ + + /* add t2 in {c+3k, ...}, but only the low half */ + cy = mpn_add_n (c3, c3, v2, k); + vinf[0] += cy; + ASSERT(vinf[0] >= cy); /* No carry */ + MPN_INCR_U (vinf, twor, vinf0); /* Add vinf0, propagate carry. */ + +#undef v0 +} diff --git a/gmp-6.3.0/mpn/generic/toom_interpolate_6pts.c b/gmp-6.3.0/mpn/generic/toom_interpolate_6pts.c new file mode 100644 index 0000000..eb23661 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_interpolate_6pts.c @@ -0,0 +1,241 @@ +/* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52 + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. 
+ +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#define BINVERT_3 MODLIMB_INVERSE_3 + +/* For odd divisors, mpn_divexact_1 works fine with two's complement. */ +#ifndef mpn_divexact_by3 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0) +#else +#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3) +#endif +#endif + +/* Interpolation for Toom-3.5, using the evaluation points: infinity, + 1, -1, 2, -2. More precisely, we want to compute + f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the + six values + + w5 = f(0), + w4 = f(-1), + w3 = f(1) + w2 = f(-2), + w1 = f(2), + w0 = limit at infinity of f(x) / x^5, + + The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at + {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at + {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most + significant limbs small). f(-1) and f(-2) may be negative, signs + determined by the flag bits. All intermediate results are positive. + Inputs are destroyed. + + Interpolation sequence was taken from the paper: "Integer and + Polynomial Multiplication: Towards Optimal Toom-Cook Matrices". + Some slight variations were introduced: adaptation to "gmp + instruction set", and a final saving of an operation by interlacing + interpolation and recomposition phases. +*/ + +void +mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags, + mp_ptr w4, mp_ptr w2, mp_ptr w1, + mp_size_t w0n) +{ + mp_limb_t cy; + /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */ + mp_limb_t cy4, cy6, embankment; + + ASSERT( n > 0 ); + ASSERT( 2*n >= w0n && w0n > 0 ); + +#define w5 pp /* 2n */ +#define w3 (pp + 2 * n) /* 2n+1 */ +#define w0 (pp + 5 * n) /* w0n */ + + /* Interpolate with sequence: + W2 =(W1 - W2)>>2 + W1 =(W1 - W5)>>1 + W1 =(W1 - W2)>>1 + W4 =(W3 - W4)>>1 + W2 =(W2 - W4)/3 + W3 = W3 - W4 - W5 + W1 =(W1 - W3)/3 + // Last steps are mixed with recomposition... 
+ W2 = W2 - W0<<2 + W4 = W4 - W2 + W3 = W3 - W1 + W2 = W2 - W0 + */ + + /* W2 =(W1 - W2)>>2 */ + if (flags & toom6_vm2_neg) + mpn_add_n (w2, w1, w2, 2 * n + 1); + else + mpn_sub_n (w2, w1, w2, 2 * n + 1); + mpn_rshift (w2, w2, 2 * n + 1, 2); + + /* W1 =(W1 - W5)>>1 */ + w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n); + mpn_rshift (w1, w1, 2 * n + 1, 1); + + /* W1 =(W1 - W2)>>1 */ +#if HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1); +#else + mpn_sub_n (w1, w1, w2, 2 * n + 1); + mpn_rshift (w1, w1, 2 * n + 1, 1); +#endif + + /* W4 =(W3 - W4)>>1 */ + if (flags & toom6_vm1_neg) + { +#if HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (w4, w3, w4, 2 * n + 1); +#else + mpn_add_n (w4, w3, w4, 2 * n + 1); + mpn_rshift (w4, w4, 2 * n + 1, 1); +#endif + } + else + { +#if HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1); +#else + mpn_sub_n (w4, w3, w4, 2 * n + 1); + mpn_rshift (w4, w4, 2 * n + 1, 1); +#endif + } + + /* W2 =(W2 - W4)/3 */ + mpn_sub_n (w2, w2, w4, 2 * n + 1); + mpn_divexact_by3 (w2, w2, 2 * n + 1); + + /* W3 = W3 - W4 - W5 */ + mpn_sub_n (w3, w3, w4, 2 * n + 1); + w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n); + + /* W1 =(W1 - W3)/3 */ + mpn_sub_n (w1, w1, w3, 2 * n + 1); + mpn_divexact_by3 (w1, w1, 2 * n + 1); + + /* + [1 0 0 0 0 0; + 0 1 0 0 0 0; + 1 0 1 0 0 0; + 0 1 0 1 0 0; + 1 0 1 0 1 0; + 0 0 0 0 0 1] + + pp[] prior to operations: + |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__| + + summation scheme for remaining operations: + |______________5|n_____4|n_____3|n_____2|n______|n______|pp + |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__| + || H w4 | L w4 | + || H w2 | L w2 | + || H w1 | L w1 | + ||-H w1 |-L w1 | + |-H w0 |-L w0 ||-H w2 |-L w2 | + */ + cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1); + MPN_INCR_U (pp + 3 * n + 1, n, cy); + + /* W2 -= W0<<2 */ +#if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n_ip1 +#if HAVE_NATIVE_mpn_sublsh2_n_ip1 + cy = mpn_sublsh2_n_ip1 (w2, w0, w0n); +#else + cy = mpn_sublsh_n (w2, w2, w0, w0n, 2); +#endif +#else + /* {W4,2*n+1} is now free and can be overwritten. */ + cy = mpn_lshift(w4, w0, w0n, 2); + cy+= mpn_sub_n(w2, w2, w4, w0n); +#endif + MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy); + + /* W4L = W4L - W2L */ + cy = mpn_sub_n (pp + n, pp + n, w2, n); + MPN_DECR_U (w3, 2 * n + 1, cy); + + /* W3H = W3H + W2L */ + cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n); + /* W1L + W2H */ + cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n); + MPN_INCR_U (w1 + n, n + 1, cy); + + /* W0 = W0 + W1H */ + if (LIKELY (w0n > n)) + cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n); + else + cy6 = mpn_add_n (w0, w0, w1 + n, w0n); + + /* + summation scheme for the next operation: + |...____5|n_____4|n_____3|n_____2|n______|n______|pp + |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__| + ...-w0___|-w1_w2 | + */ + /* if(LIKELY(w0n>n)) the two operands below DO overlap! 
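+     The subtrahend {pp + 4*n, n + w0n} then starts inside the destination
+     {pp + 2*n, n + w0n}; presumably this is safe because the subtraction
+     proceeds from the least significant limb upward, so each limb of the
+     higher-addressed operand is read before that position can be written.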
*/ + cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n); + + /* embankment is a "dirty trick" to avoid carry/borrow propagation + beyond allocated memory */ + embankment = w0[w0n - 1] - 1; + w0[w0n - 1] = 1; + if (LIKELY (w0n > n)) { + if (cy4 > cy6) + MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6); + else + MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4); + MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy); + MPN_INCR_U (w0 + n, w0n - n, cy6); + } else { + MPN_INCR_U (pp + 4 * n, w0n + n, cy4); + MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6); + } + w0[w0n - 1] += embankment; + +#undef w5 +#undef w3 +#undef w0 + +} diff --git a/gmp-6.3.0/mpn/generic/toom_interpolate_7pts.c b/gmp-6.3.0/mpn/generic/toom_interpolate_7pts.c new file mode 100644 index 0000000..167c45b --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_interpolate_7pts.c @@ -0,0 +1,274 @@ +/* mpn_toom_interpolate_7pts -- Interpolate for toom44, 53, 62. + + Contributed to the GNU project by Niels Möller. + Improvements by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2006, 2007, 2009, 2014, 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +#define BINVERT_3 MODLIMB_INVERSE_3 + +#define BINVERT_9 \ + ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39) + +#define BINVERT_15 \ + ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15) + +/* For the various mpn_divexact_byN here, fall back to using either + mpn_pi1_bdiv_q_1 or mpn_divexact_1. The former has less overhead and is + many faster if it is native. For now, since mpn_divexact_1 is native on + several platforms where mpn_pi1_bdiv_q_1 does not yet exist, do not use + mpn_pi1_bdiv_q_1 unconditionally. FIXME. */ + +/* For odd divisors, mpn_divexact_1 works fine with two's complement. 
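+   Exact division by an odd d amounts to multiplying by the inverse of d
+   modulo 2^GMP_NUMB_BITS; since the whole computation is carried out modulo
+   a power of two, a negative input in two's complement simply yields the
+   two's-complement representation of the quotient.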
*/ +#ifndef mpn_divexact_by3 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0) +#else +#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3) +#endif +#endif + +#ifndef mpn_divexact_by9 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by9(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,9,BINVERT_9,0) +#else +#define mpn_divexact_by9(dst,src,size) mpn_divexact_1(dst,src,size,9) +#endif +#endif + +#ifndef mpn_divexact_by15 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by15(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,15,BINVERT_15,0) +#else +#define mpn_divexact_by15(dst,src,size) mpn_divexact_1(dst,src,size,15) +#endif +#endif + +/* Interpolation for toom4, using the evaluation points 0, infinity, + 1, -1, 2, -2, 1/2. More precisely, we want to compute + f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 6, given the + seven values + + w0 = f(0), + w1 = f(-2), + w2 = f(1), + w3 = f(-1), + w4 = f(2) + w5 = 64 * f(1/2) + w6 = limit at infinity of f(x) / x^6, + + The result is 6*n + w6n limbs. At entry, w0 is stored at {rp, 2n }, + w2 is stored at { rp + 2n, 2n+1 }, and w6 is stored at { rp + 6n, + w6n }. The other values are 2n + 1 limbs each (with most + significant limbs small). f(-1) and f(-1/2) may be negative, signs + determined by the flag bits. Inputs are destroyed. + + Needs (2*n + 1) limbs of temporary storage. +*/ + +void +mpn_toom_interpolate_7pts (mp_ptr rp, mp_size_t n, enum toom7_flags flags, + mp_ptr w1, mp_ptr w3, mp_ptr w4, mp_ptr w5, + mp_size_t w6n, mp_ptr tp) +{ + mp_size_t m; + mp_limb_t cy; + + m = 2*n + 1; +#define w0 rp +#define w2 (rp + 2*n) +#define w6 (rp + 6*n) + + ASSERT (w6n > 0); + ASSERT (w6n <= 2*n); + + /* Using formulas similar to Marco Bodrato's + + W5 = W5 + W4 + W1 =(W4 - W1)/2 + W4 = W4 - W0 + W4 =(W4 - W1)/4 - W6*16 + W3 =(W2 - W3)/2 + W2 = W2 - W3 + + W5 = W5 - W2*65 May be negative. + W2 = W2 - W6 - W0 + W5 =(W5 + W2*45)/2 Now >= 0 again. + W4 =(W4 - W2)/3 + W2 = W2 - W4 + + W1 = W5 - W1 May be negative. + W5 =(W5 - W3*8)/9 + W3 = W3 - W5 + W1 =(W1/15 + W5)/2 Now >= 0 again. + W5 = W5 - W1 + + where W0 = f(0), W1 = f(-2), W2 = f(1), W3 = f(-1), + W4 = f(2), W5 = f(1/2), W6 = f(oo), + + Note that most intermediate results are positive; the ones that + may be negative are represented in two's complement. We must + never shift right a value that may be negative, since that would + invalidate the sign bit. On the other hand, divexact by odd + numbers work fine with two's complement. 
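+     The scratch area tp (2*n + 1 limbs) is touched only twice below: once
+     to form the shifted value W6*16 and once to form W3*8.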
+ */ + + mpn_add_n (w5, w5, w4, m); + if (flags & toom7_w1_neg) + { +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (w1, w1, w4, m); +#else + mpn_add_n (w1, w1, w4, m); ASSERT (!(w1[0] & 1)); + mpn_rshift (w1, w1, m, 1); +#endif + } + else + { +#ifdef HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (w1, w4, w1, m); +#else + mpn_sub_n (w1, w4, w1, m); ASSERT (!(w1[0] & 1)); + mpn_rshift (w1, w1, m, 1); +#endif + } + mpn_sub (w4, w4, m, w0, 2*n); + mpn_sub_n (w4, w4, w1, m); ASSERT (!(w4[0] & 3)); + mpn_rshift (w4, w4, m, 2); /* w4>=0 */ + + tp[w6n] = mpn_lshift (tp, w6, w6n, 4); + mpn_sub (w4, w4, m, tp, w6n+1); + + if (flags & toom7_w3_neg) + { +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (w3, w3, w2, m); +#else + mpn_add_n (w3, w3, w2, m); ASSERT (!(w3[0] & 1)); + mpn_rshift (w3, w3, m, 1); +#endif + } + else + { +#ifdef HAVE_NATIVE_mpn_rsh1sub_n + mpn_rsh1sub_n (w3, w2, w3, m); +#else + mpn_sub_n (w3, w2, w3, m); ASSERT (!(w3[0] & 1)); + mpn_rshift (w3, w3, m, 1); +#endif + } + + mpn_sub_n (w2, w2, w3, m); + + mpn_submul_1 (w5, w2, m, 65); + mpn_sub (w2, w2, m, w6, w6n); + mpn_sub (w2, w2, m, w0, 2*n); + + mpn_addmul_1 (w5, w2, m, 45); ASSERT (!(w5[0] & 1)); + mpn_rshift (w5, w5, m, 1); + mpn_sub_n (w4, w4, w2, m); + + mpn_divexact_by3 (w4, w4, m); + mpn_sub_n (w2, w2, w4, m); + + mpn_sub_n (w1, w5, w1, m); + mpn_lshift (tp, w3, m, 3); + mpn_sub_n (w5, w5, tp, m); + mpn_divexact_by9 (w5, w5, m); + mpn_sub_n (w3, w3, w5, m); + + mpn_divexact_by15 (w1, w1, m); +#ifdef HAVE_NATIVE_mpn_rsh1add_n + mpn_rsh1add_n (w1, w1, w5, m); + w1[m - 1] &= GMP_NUMB_MASK >> 1; +#else + mpn_add_n (w1, w1, w5, m); ASSERT (!(w1[0] & 1)); + mpn_rshift (w1, w1, m, 1); /* w1>=0 now */ +#endif + + mpn_sub_n (w5, w5, w1, m); + + /* These bounds are valid for the 4x4 polynomial product of toom44, + * and they are conservative for toom53 and toom62. */ + ASSERT (w1[2*n] < 2); + ASSERT (w2[2*n] < 3); + ASSERT (w3[2*n] < 4); + ASSERT (w4[2*n] < 3); + ASSERT (w5[2*n] < 2); + + /* Addition chain. Note carries and the 2n'th limbs that need to be + * added in. + * + * Special care is needed for w2[2n] and the corresponding carry, + * since the "simple" way of adding it all together would overwrite + * the limb at wp[2*n] and rp[4*n] (same location) with the sum of + * the high half of w3 and the low half of w4. 
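+   * (This is why, below, w2[2n] plus the pending carry is folded into the
+   * high half of w3 with MPN_INCR_U instead of being added at rp[4*n].)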
+ * + * 7 6 5 4 3 2 1 0 + * | | | | | | | | | + * ||w3 (2n+1)| + * ||w4 (2n+1)| + * ||w5 (2n+1)| ||w1 (2n+1)| + * + | w6 (w6n)| ||w2 (2n+1)| w0 (2n) | (share storage with r) + * ----------------------------------------------- + * r | | | | | | | | | + * c7 c6 c5 c4 c3 Carries to propagate + */ + + cy = mpn_add_n (rp + n, rp + n, w1, m); + MPN_INCR_U (w2 + n + 1, n , cy); + cy = mpn_add_n (rp + 3*n, rp + 3*n, w3, n); + MPN_INCR_U (w3 + n, n + 1, w2[2*n] + cy); + cy = mpn_add_n (rp + 4*n, w3 + n, w4, n); + MPN_INCR_U (w4 + n, n + 1, w3[2*n] + cy); + cy = mpn_add_n (rp + 5*n, w4 + n, w5, n); + MPN_INCR_U (w5 + n, n + 1, w4[2*n] + cy); + if (w6n > n + 1) + { + cy = mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, n + 1); + MPN_INCR_U (rp + 7*n + 1, w6n - n - 1, cy); + } + else + { + ASSERT_NOCARRY (mpn_add_n (rp + 6*n, rp + 6*n, w5 + n, w6n)); +#if WANT_ASSERT + { + mp_size_t i; + for (i = w6n; i <= n; i++) + ASSERT (w5[n + i] == 0); + } +#endif + } +} diff --git a/gmp-6.3.0/mpn/generic/toom_interpolate_8pts.c b/gmp-6.3.0/mpn/generic/toom_interpolate_8pts.c new file mode 100644 index 0000000..5e65fab --- /dev/null +++ b/gmp-6.3.0/mpn/generic/toom_interpolate_8pts.c @@ -0,0 +1,211 @@ +/* mpn_toom_interpolate_8pts -- Interpolate for toom54, 63, 72. + + Contributed to the GNU project by Marco Bodrato. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. 
*/ + +#include "gmp-impl.h" + +#define BINVERT_3 MODLIMB_INVERSE_3 + +#define BINVERT_15 \ + ((((GMP_NUMB_MAX >> (GMP_NUMB_BITS % 4)) / 15) * 14 * 16 & GMP_NUMB_MAX) + 15) + +#define BINVERT_45 ((BINVERT_15 * BINVERT_3) & GMP_NUMB_MASK) + +#ifndef mpn_divexact_by3 +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0) +#else +#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3) +#endif +#endif + +#ifndef mpn_divexact_by45 +#if GMP_NUMB_BITS % 12 == 0 +#define mpn_divexact_by45(dst,src,size) \ + (63 & 19 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 45))) +#else +#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 +#define mpn_divexact_by45(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,45,BINVERT_45,0) +#else +#define mpn_divexact_by45(dst,src,size) mpn_divexact_1(dst,src,size,45) +#endif +#endif +#endif + +#if HAVE_NATIVE_mpn_sublsh2_n_ip1 +#define DO_mpn_sublsh2_n(dst,src,n,ws) mpn_sublsh2_n_ip1(dst,src,n) +#else +#define DO_mpn_sublsh2_n(dst,src,n,ws) DO_mpn_sublsh_n(dst,src,n,2,ws) +#endif + +#if HAVE_NATIVE_mpn_sublsh_n +#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n (dst,dst,src,n,s) +#else +static mp_limb_t +DO_mpn_sublsh_n (mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws) +{ +#if USE_MUL_1 && 0 + return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s)); +#else + mp_limb_t __cy; + __cy = mpn_lshift (ws,src,n,s); + return __cy + mpn_sub_n (dst,dst,ws,n); +#endif +} +#endif + + +#if HAVE_NATIVE_mpn_subrsh +#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh (dst,nd,src,ns,s) +#else +/* This is not a correct definition, it assumes no carry */ +#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \ +do { \ + mp_limb_t __cy; \ + MPN_DECR_U (dst, nd, src[0] >> s); \ + __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \ + MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \ +} while (0) +#endif + +/* Interpolation for Toom-4.5 (or Toom-4), using the evaluation + points: infinity(4.5 only), 4, -4, 2, -2, 1, -1, 0. More precisely, + we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of + degree 7 (or 6), given the 8 (rsp. 7) values: + + r1 = limit at infinity of f(x) / x^7, + r2 = f(4), + r3 = f(-4), + r4 = f(2), + r5 = f(-2), + r6 = f(1), + r7 = f(-1), + r8 = f(0). + + All couples of the form f(n),f(-n) must be already mixed with + toom_couple_handling(f(n),...,f(-n),...) + + The result is stored in {pp, spt + 7*n (or 6*n)}. + At entry, r8 is stored at {pp, 2n}, + r5 is stored at {pp + 3n, 3n + 1}. + + The other values are 2n+... limbs each (with most significant limbs small). + + All intermediate results are positive. + Inputs are destroyed. 
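+
+   Unlike the 16-point routine above, only r3 and r7 are passed in separate
+   areas; r5 lives at {pp + 3n, 3n + 1} as stated above, and r1, the
+   leading-coefficient value, occupies the top {pp + 7n, spt} limbs.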
+*/ + +void +mpn_toom_interpolate_8pts (mp_ptr pp, mp_size_t n, + mp_ptr r3, mp_ptr r7, + mp_size_t spt, mp_ptr ws) +{ + mp_limb_signed_t cy; + mp_ptr r5, r1; + r5 = (pp + 3 * n); /* 3n+1 */ + r1 = (pp + 7 * n); /* spt */ + + /******************************* interpolation *****************************/ + + DO_mpn_subrsh(r3+n, 2 * n + 1, pp, 2 * n, 4, ws); + cy = DO_mpn_sublsh_n (r3, r1, spt, 12, ws); + MPN_DECR_U (r3 + spt, 3 * n + 1 - spt, cy); + + DO_mpn_subrsh(r5+n, 2 * n + 1, pp, 2 * n, 2, ws); + cy = DO_mpn_sublsh_n (r5, r1, spt, 6, ws); + MPN_DECR_U (r5 + spt, 3 * n + 1 - spt, cy); + + r7[3*n] -= mpn_sub_n (r7+n, r7+n, pp, 2 * n); + cy = mpn_sub_n (r7, r7, r1, spt); + MPN_DECR_U (r7 + spt, 3 * n + 1 - spt, cy); + + ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1)); + ASSERT_NOCARRY(mpn_rshift(r3, r3, 3 * n + 1, 2)); + + ASSERT_NOCARRY(mpn_sub_n (r5, r5, r7, 3 * n + 1)); + + ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, 3 * n + 1)); + + mpn_divexact_by45 (r3, r3, 3 * n + 1); + + ASSERT_NOCARRY(mpn_divexact_by3 (r5, r5, 3 * n + 1)); + + ASSERT_NOCARRY(DO_mpn_sublsh2_n (r5, r3, 3 * n + 1, ws)); + + /* last interpolation steps... */ + /* ... are mixed with recomposition */ + + /***************************** recomposition *******************************/ + /* + pp[] prior to operations: + |_H r1|_L r1|____||_H r5|_M_r5|_L r5|_____|_H r8|_L r8|pp + + summation scheme for remaining operations: + |____8|n___7|n___6|n___5|n___4|n___3|n___2|n____|n____|pp + |_H r1|_L r1|____||_H*r5|_M r5|_L r5|_____|_H_r8|_L r8|pp + ||_H r3|_M r3|_L*r3| + ||_H_r7|_M_r7|_L_r7| + ||-H r3|-M r3|-L*r3| + ||-H*r5|-M_r5|-L_r5| + */ + + cy = mpn_add_n (pp + n, pp + n, r7, n); /* Hr8+Lr7-Lr5 */ + cy-= mpn_sub_n (pp + n, pp + n, r5, n); + if (cy > 0) { + MPN_INCR_U (r7 + n, 2*n + 1, 1); + cy = 0; + } + + cy = mpn_sub_nc (pp + 2*n, r7 + n, r5 + n, n, -cy); /* Mr7-Mr5 */ + MPN_DECR_U (r7 + 2*n, n + 1, cy); + + cy = mpn_add_n (pp + 3*n, r5, r7+ 2*n, n+1); /* Hr7+Lr5 */ + r5[3*n]+= mpn_add_n (r5 + 2*n, r5 + 2*n, r3, n); /* Hr5+Lr3 */ + cy-= mpn_sub_n (pp + 3*n, pp + 3*n, r5 + 2*n, n+1); /* Hr7-Hr5+Lr5-Lr3 */ + if (UNLIKELY(0 > cy)) + MPN_DECR_U (r5 + n + 1, 2*n, 1); + else + MPN_INCR_U (r5 + n + 1, 2*n, cy); + + ASSERT_NOCARRY(mpn_sub_n(pp + 4*n, r5 + n, r3 + n, 2*n +1)); /* Mr5-Mr3,Hr5-Hr3 */ + + cy = mpn_add_1 (pp + 6*n, r3 + n, n, pp[6*n]); + MPN_INCR_U (r3 + 2*n, n + 1, cy); + cy = mpn_add_n (pp + 7*n, pp + 7*n, r3 + 2*n, n); + if (LIKELY(spt != n)) + MPN_INCR_U (pp + 8*n, spt - n, cy + r3[3*n]); + else + ASSERT (r3[3*n] + cy == 0); +} diff --git a/gmp-6.3.0/mpn/generic/trialdiv.c b/gmp-6.3.0/mpn/generic/trialdiv.c new file mode 100644 index 0000000..65e089f --- /dev/null +++ b/gmp-6.3.0/mpn/generic/trialdiv.c @@ -0,0 +1,131 @@ +/* mpn_trialdiv -- find small factors of an mpn number using trial division. + + Contributed to the GNU project by Torbjorn Granlund. + + THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY + SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST + GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. + +Copyright 2009, 2010, 2012, 2013 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +/* + This function finds the first (smallest) factor represented in + trialdivtab.h. It does not stop the factoring effort just because it has + reached some sensible limit, such as the square root of the input number. + + The caller can limit the factoring effort by passing NPRIMES. The function + will then divide until that limit, or perhaps a few primes more. A position + which only mpn_trialdiv can make sense of is returned in the WHERE + parameter. It can be used for restarting the factoring effort; the first + call should pass 0 here. + + Input: 1. A non-negative number T = {tp,tn} + 2. NPRIMES as described above, + 3. *WHERE as described above. + Output: 1. *WHERE updated as described above. + 2. Return value is non-zero if we found a factor, else zero + To get the actual prime factor, compute the mod B inverse + of the return value. +*/ + +#include "gmp-impl.h" + +struct gmp_primes_dtab { + mp_limb_t binv; + mp_limb_t lim; +}; + +struct gmp_primes_ptab { + mp_limb_t ppp; /* primes, multiplied together */ + mp_limb_t cps[7]; /* ppp values pre-computed for mpn_mod_1s_4p */ + gmp_uint_least32_t idx:24; /* index of first primes in dtab */ + gmp_uint_least32_t np :8; /* number of primes related to this entry */ +}; + + +static const struct gmp_primes_dtab gmp_primes_dtab[] = +{ +#define WANT_dtab +#define P(p,inv,lim) {inv,lim} +#include "trialdivtab.h" +#undef WANT_dtab +#undef P + {0,0} +}; + +static const struct gmp_primes_ptab gmp_primes_ptab[] = +{ +#define WANT_ptab +#include "trialdivtab.h" +#undef WANT_ptab +}; + +#define PTAB_LINES (sizeof (gmp_primes_ptab) / sizeof (gmp_primes_ptab[0])) + +/* FIXME: We could optimize out one of the outer loop conditions if we + had a final ptab entry with a huge np field. */ +mp_limb_t +mpn_trialdiv (mp_srcptr tp, mp_size_t tn, mp_size_t nprimes, int *where) +{ + mp_limb_t ppp; + const mp_limb_t *cps; + const struct gmp_primes_dtab *dp; + long i, j, idx, np; + mp_limb_t r, q; + + ASSERT (tn >= 1); + + for (i = *where; i < PTAB_LINES; i++) + { + ppp = gmp_primes_ptab[i].ppp; + cps = gmp_primes_ptab[i].cps; + + r = mpn_mod_1s_4p (tp, tn, ppp << cps[1], cps); + + idx = gmp_primes_ptab[i].idx; + np = gmp_primes_ptab[i].np; + + /* Check divisibility by individual primes. */ + dp = &gmp_primes_dtab[idx] + np; + for (j = -np; j < 0; j++) + { + q = r * dp[j].binv; + if (q <= dp[j].lim) + { + *where = i; + return dp[j].binv; + } + } + + nprimes -= np; + if (nprimes <= 0) + return 0; + } + return 0; +} diff --git a/gmp-6.3.0/mpn/generic/udiv_w_sdiv.c b/gmp-6.3.0/mpn/generic/udiv_w_sdiv.c new file mode 100644 index 0000000..7907135 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/udiv_w_sdiv.c @@ -0,0 +1,141 @@ +/* mpn_udiv_w_sdiv -- implement udiv_qrnnd on machines with only signed + division. + + Contributed by Peter L. Montgomery. + + THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. 
IT IS ONLY SAFE + TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS + ALMOST GUARANTEED THAT THIS FUNCTION WILL CHANGE OR DISAPPEAR IN A FUTURE + GNU MP RELEASE. + + +Copyright 1992, 1994, 1996, 2000, 2011, 2012 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "longlong.h" + +mp_limb_t +mpn_udiv_w_sdiv (mp_limb_t *rp, mp_limb_t a1, mp_limb_t a0, mp_limb_t d) +{ + mp_limb_t q, r; + mp_limb_t c0, c1, b1; + + ASSERT (d != 0); + ASSERT (a1 < d); + + if ((mp_limb_signed_t) d >= 0) + { + if (a1 < d - a1 - (a0 >> (GMP_LIMB_BITS - 1))) + { + /* dividend, divisor, and quotient are nonnegative */ + sdiv_qrnnd (q, r, a1, a0, d); + } + else + { + /* Compute c1*2^32 + c0 = a1*2^32 + a0 - 2^31*d */ + sub_ddmmss (c1, c0, a1, a0, d >> 1, d << (GMP_LIMB_BITS - 1)); + /* Divide (c1*2^32 + c0) by d */ + sdiv_qrnnd (q, r, c1, c0, d); + /* Add 2^31 to quotient */ + q += (mp_limb_t) 1 << (GMP_LIMB_BITS - 1); + } + } + else + { + b1 = d >> 1; /* d/2, between 2^30 and 2^31 - 1 */ + c1 = a1 >> 1; /* A/2 */ + c0 = (a1 << (GMP_LIMB_BITS - 1)) + (a0 >> 1); + + if (a1 < b1) /* A < 2^32*b1, so A/2 < 2^31*b1 */ + { + sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */ + + r = 2*r + (a0 & 1); /* Remainder from A/(2*b1) */ + if ((d & 1) != 0) + { + if (r >= q) + r = r - q; + else if (q - r <= d) + { + r = r - q + d; + q--; + } + else + { + r = r - q + 2*d; + q -= 2; + } + } + } + else if (c1 < b1) /* So 2^31 <= (A/2)/b1 < 2^32 */ + { + c1 = (b1 - 1) - c1; + c0 = ~c0; /* logical NOT */ + + sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */ + + q = ~q; /* (A/2)/b1 */ + r = (b1 - 1) - r; + + r = 2*r + (a0 & 1); /* A/(2*b1) */ + + if ((d & 1) != 0) + { + if (r >= q) + r = r - q; + else if (q - r <= d) + { + r = r - q + d; + q--; + } + else + { + r = r - q + 2*d; + q -= 2; + } + } + } + else /* Implies c1 = b1 */ + { /* Hence a1 = d - 1 = 2*b1 - 1 */ + if (a0 >= -d) + { + q = -CNST_LIMB(1); + r = a0 + d; + } + else + { + q = -CNST_LIMB(2); + r = a0 + 2*d; + } + } + } + + *rp = r; + return q; +} diff --git a/gmp-6.3.0/mpn/generic/zero.c b/gmp-6.3.0/mpn/generic/zero.c new file mode 100644 index 0000000..1a05453 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/zero.c @@ -0,0 +1,41 @@ +/* mpn_zero + +Copyright 2009 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. 
+ +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" + +void +mpn_zero (mp_ptr rp, mp_size_t n) +{ + mp_size_t i; + + rp += n; + for (i = -n; i != 0; i++) + rp[i] = 0; +} diff --git a/gmp-6.3.0/mpn/generic/zero_p.c b/gmp-6.3.0/mpn/generic/zero_p.c new file mode 100644 index 0000000..c92f9b8 --- /dev/null +++ b/gmp-6.3.0/mpn/generic/zero_p.c @@ -0,0 +1,33 @@ +/* mpn_zero_p (x,xsize) -- Return 1 if X is zero, 0 if it is non-zero. + +Copyright 2015 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#define __GMP_FORCE_mpn_zero_p 1 + +#include "gmp-impl.h" -- cgit v1.2.3
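Two illustrative sketches follow. They are not part of the patch above; each replays, on plain 64-bit integers, an idea the files in this patch rely on. The helper name binvert_limb_sketch and the toy coefficients are local to the sketches and are not GMP identifiers.

First, the BINVERT_* constants defined in the interpolation files are multiplicative inverses modulo 2^GMP_NUMB_BITS, and mpn_divexact_1 / mpn_pi1_bdiv_q_1 divide an exact multiple by multiplying with such an inverse. The sketch below assumes a 64-bit limb and uses uint64_t instead of mp_limb_t; it derives an inverse by Newton's iteration and checks both properties. The same computation also recovers the prime from mpn_trialdiv's return value, which trialdiv.c documents as the mod B inverse of the prime found.

/* Minimal sketch, not GMP code: binary inverse of an odd 64-bit divisor. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t
binvert_limb_sketch (uint64_t d)          /* d must be odd */
{
  uint64_t inv = d;                       /* d*d == 1 (mod 8): 3 correct bits */
  inv *= 2 - d * inv;                     /*  6 bits */
  inv *= 2 - d * inv;                     /* 12 bits */
  inv *= 2 - d * inv;                     /* 24 bits */
  inv *= 2 - d * inv;                     /* 48 bits */
  inv *= 2 - d * inv;                     /* 96 >= 64 bits */
  return inv;
}

int
main (void)
{
  uint64_t d = 255;
  uint64_t inv = binvert_limb_sketch (d);

  assert (d * inv == 1);                  /* inverse modulo 2^64 */

  uint64_t n = d * UINT64_C (123456789);  /* an exact multiple of d */
  assert (n * inv == n / d);              /* exact division by one multiply */

  /* Inverting again recovers the original odd value; this is the step
     trialdiv.c describes for turning mpn_trialdiv's return value back
     into the prime factor.  */
  assert (binvert_limb_sketch (inv) == d);

  printf ("inverse of %llu mod 2^64 is 0x%016llx\n",
          (unsigned long long) d, (unsigned long long) inv);
  return 0;
}

Second, the numbered steps (1)-(8) in mpn_toom_interpolate_5pts can be checked on a toy polynomial with single-word arithmetic. The sketch below follows the weight-vector annotations of that file literally; the sign handling of the real code (the sa flag and |vm1|) is left out because ordinary signed integers are used, and the divisions are exact by construction.

/* Minimal sketch, not GMP code: the 5-point interpolation on small ints. */
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  /* f(x) = a4*x^4 + a3*x^3 + a2*x^2 + a1*x + a0 */
  int64_t a4 = 7, a3 = -3, a2 = 11, a1 = 5, a0 = 2;

  int64_t v0   = a0;                               /* f(0)          */
  int64_t v1   = a4 + a3 + a2 + a1 + a0;           /* f(1)          */
  int64_t vm1  = a4 - a3 + a2 - a1 + a0;           /* f(-1)         */
  int64_t v2   = 16*a4 + 8*a3 + 4*a2 + 2*a1 + a0;  /* f(2)          */
  int64_t vinf = a4;                               /* leading coeff */

  v2  = (v2 - vm1) / 3;   /* (1) (15 9 3 3 0)/3 = (5 3 1 1 0) */
  vm1 = (v1 - vm1) / 2;   /* (2) (0 1 0 1 0)                  */
  v1  = v1 - v0;          /* (3) (1 1 1 1 0)                  */
  v2  = (v2 - v1) / 2;    /* (4) (2 1 0 0 0)                  */
  v1  = v1 - vm1;         /* (5) (1 0 1 0 0)                  */
  v2  = v2 - 2 * vinf;    /* (6) (0 1 0 0 0)                  */
  v1  = v1 - vinf;        /* (7) (0 0 1 0 0)                  */
  vm1 = vm1 - v2;         /* (8) (0 0 0 1 0)                  */

  assert (vinf == a4 && v2 == a3 && v1 == a2 && vm1 == a1 && v0 == a0);
  return 0;
}

Both programs should run to completion with every assert passing; they are only meant to make the weight-vector and binary-inverse comments in the files above concrete.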